def execute(filelocation, args, outdir, filters=None,
            executable='msConvert.exe'):
    """Execute the msConvert tool on Windows operating systems.

    The tool is started as a subprocess and everything it writes to stderr is
    forwarded to ``sys.stdout`` while the process is still running. The
    function returns after the process has terminated; its return code is not
    evaluated.

    :param filelocation: input file path
    :param args: str() or list(), msConvert arguments, for details see the
        msConvert command line help.
    :param outdir: path of the output directory, passed to msConvert as ``-o``
    :param filters: str() or list(), specify additional parameters and
        filters, for details see the msConvert command line help. Every entry
        is passed to msConvert as a separate ``--filter`` argument.
    :param executable: must specify the complete file path of the
        msConvert.exe if its location is not in the ``PATH`` environment
        variable.
    """
    procArgs = [executable, filelocation]
    procArgs.extend(aux.toList(args))
    if filters is not None:
        for arg in aux.toList(filters):
            procArgs.extend(['--filter', arg])
    procArgs.extend(['-o', outdir])

    # The pipe is opened in text mode (universal_newlines=True) so that read()
    # returns str on Python 2 and on Python 3. With a binary pipe Python 3
    # returns bytes, the end of stream marker b'' never equals '' and the loop
    # below would neither terminate nor be able to write to sys.stdout.
    proc = subprocess.Popen(procArgs, stderr=subprocess.PIPE,
                            universal_newlines=True)
    try:
        # Do not wait until the tool has finished, display its output
        # immediately character by character.
        while True:
            out = proc.stderr.read(1)
            if out == '' and proc.poll() is not None:
                break
            if out != '':
                sys.stdout.write(out)
                sys.stdout.flush()
    finally:
        proc.stderr.close()
def getArrays(self, attr=None, sort=False, reverse=False, selector=None,
              defaultValue=None, report='lfq'):
    """Return the attributes of the selected items as a mapping of arrays.

    The attributes 'id' and 'intensities' are always requested. The
    'intensities' entry is afterwards split up into one ``numpy.float64``
    array per entry of ``self._matrixTemplate`` and removed from the result.

    :param attr: attribute name or list of attribute names that should be
        returned in addition to 'id' and the intensity columns
    :param sort: passed on to ``self.getItems()``
    :param reverse: passed on to ``self.getItems()``
    :param selector: function used to select items, passed on to
        ``self.getItems()``. By default only items with a true ``isValid``
        attribute are selected.
    :param defaultValue: passed on to ``_getArrays()``
    :param report: currently not used by this method

    :returns: the container returned by ``_getArrays()``, extended by one
        entry per entry of ``self._matrixTemplate``
    """
    if selector is None:
        selector = lambda fgi: fgi.isValid
    if attr is None:
        attr = []
    attr = set(['id', 'intensities'] + aux.toList(attr))

    selectedItems = self.getItems(sort, reverse, selector)
    arrays = _getArrays(selectedItems, attr, defaultValue)

    # Start with one empty column per entry of the matrix template ...
    for columnName in self._matrixTemplate:
        arrays[columnName] = []
    # ... distribute the intensity values of each item to the columns ...
    for itemIntensities in arrays['intensities']:
        for columnName, value in zip(self._matrixTemplate, itemIntensities):
            arrays[columnName].append(value)
    # ... and finally convert each column into a float array.
    for columnName in self._matrixTemplate:
        arrays[columnName] = numpy.array(arrays[columnName],
                                         dtype=numpy.float64)

    del arrays['intensities']
    return arrays
def execute(filelocation, outpath, executable, args=None, switchArgs=None):
    """Executes the spectra-cluster-cli java tool as a subprocess.

    The jar file is started with ``java -jar`` and everything the process
    writes to stderr is forwarded to ``sys.stdout`` while it is still running.
    The function returns after the process has terminated; its return code is
    not evaluated.

    :param filelocation: either a single mgf file path or a list of file paths.
    :param outpath: path of the output file, file must not exist
    :param executable: must specify the complete file path of the
        spectra-cluster-cli.jar file, supported version is 1.0.2 BETA.
    :param args: list of arguments containing a value, for details see the
        spectra-cluster-cli help. Arguments should be added as tuples or a
        list. For example: [('precursor_tolerance', '0.5'), ('rounds', '3')]
    :param switchArgs: list of arguments not containing a value, for details
        see the spectra-cluster-cli help. Arguments should be added as
        strings. For example: ['fast_mode', 'keep_binary_files']
    """
    procArgs = ['java', '-jar', executable]
    procArgs.extend(['-output_path', outpath])
    if args is not None:
        for arg in args:
            # Argument names are passed without the leading dash.
            procArgs.extend(['-' + arg[0], arg[1]])
    if switchArgs is not None:
        procArgs.extend(['-' + arg for arg in switchArgs])
    # The input files are the last entries of the command line.
    procArgs.extend(aux.toList(filelocation))

    # The pipe is opened in text mode (universal_newlines=True) so that read()
    # returns str on Python 2 and on Python 3. With a binary pipe Python 3
    # returns bytes, the end of stream marker b'' never equals '' and the loop
    # below would neither terminate nor be able to write to sys.stdout.
    proc = subprocess.Popen(procArgs, stderr=subprocess.PIPE,
                            universal_newlines=True)
    try:
        # Do not wait until the tool has finished, display its output
        # immediately character by character.
        while True:
            out = proc.stderr.read(1)
            if out == '' and proc.poll() is not None:
                break
            if out != '':
                sys.stdout.write(out)
                sys.stdout.flush()
    finally:
        proc.stderr.close()
def expectedLabelPosition(peptide, labelStateInfo, sequence=None,
                          modPositions=None):
    """Returns the label modifications expected for one label state of a
    peptide.

    :param peptide: Peptide sequence used to calculate the expected label
        state modifications
    :param labelStateInfo: An entry of :attr:`LabelDescriptor.labels` that
        describes a label state. The keys 'aminoAcidLabels' and
        'excludingModifications' are read.
    :param sequence: unmodified amino acid sequence of :var:`peptide`, if None
        it is generated by :func:`maspy.peptidemethods.removeModifications()`
    :param modPositions: dictionary describing the modification state of
        "peptide", if None it is generated by
        :func:`maspy.peptidemethods.returnModPositions()`

    :returns: {sequence position: sorted list of expected label modifications
        on that position, ...}
    """
    if modPositions is None:
        modPositions = maspy.peptidemethods.returnModPositions(peptide,
                                                               indexStart=0)
    if sequence is None:
        sequence = maspy.peptidemethods.removeModifications(peptide)

    # Collect all label symbols that are expected at each position.
    expectedMods = dict()
    for target, symbols in viewitems(labelStateInfo['aminoAcidLabels']):
        symbols = aux.toList(symbols)
        if symbols == ['']:
            # An empty symbol means that the target bears no label.
            continue
        if target == 'nTerm':
            # N-terminal labels are stored at position 0.
            positions = [0]
        else:
            positions = aux.findAllSubstrings(sequence, target)
        for position in positions:
            expectedMods.setdefault(position, list()).extend(symbols)

    # Remove labels from positions that carry an excluding modification.
    excludingMods = labelStateInfo['excludingModifications']
    if excludingMods is not None:
        for excludingMod, excludedSymbol in viewitems(excludingMods):
            if excludingMod not in modPositions:
                continue
            for position in modPositions[excludingMod]:
                symbolsAtPosition = expectedMods.get(position)
                if symbolsAtPosition is None:
                    continue
                if excludedSymbol not in symbolsAtPosition:
                    continue
                if len(symbolsAtPosition) == 1:
                    del expectedMods[position]
                else:
                    # Removes only the first occurrence of the symbol.
                    symbolsAtPosition.remove(excludedSymbol)

    for _, symbolsAtPosition in viewitems(expectedMods):
        symbolsAtPosition.sort()
    return expectedMods
def _addProteinIdsToGroupMapping(self, proteinIds, groupId):
    """Register a groupId for one or multiple proteins in the internal
    ``_proteinToGroupIds`` mapping.

    :param proteinIds: a proteinId or a list of proteinIds, a proteinId
        must be a string.
    :param groupId: str, the groupId that is added to the entry of each
        protein
    """
    idList = AUX.toList(proteinIds)
    for proteinId in idList:
        groupIdsOfProtein = self._proteinToGroupIds[proteinId]
        groupIdsOfProtein.add(groupId)
def addSubsumableToGroups(self, proteinIds, groupIds):
    """Add subsumable proteins to protein groups and update the internal
    protein to group mapping accordingly.

    :param proteinIds: a proteinId or a list of proteinIds, a proteinId
        must be a string.
    :param groupIds: a groupId or a list of groupIds, a groupId
        must be a string.
    """
    targetGroupIds = AUX.toList(groupIds)
    for groupId in targetGroupIds:
        proteinGroup = self.groups[groupId]
        proteinGroup.addSubsumableProteins(proteinIds)
        self._addProteinIdsToGroupMapping(proteinIds, groupId)
def _addProteins(self, proteinIds, containerNames):
    """Add one or multiple proteinIds to each of the specified containers.

    Every container is looked up as an attribute of the instance and extended
    by calling its ``update()`` method.

    :param proteinIds: a proteinId or a list of proteinIds, a proteinId
        must be a string.
    :param containerNames: list, entries must be one or multiple of
        'leading', 'subset', 'subsumableProteins' or 'proteins'
    """
    idList = AUX.toList(proteinIds)
    for name in containerNames:
        getattr(self, name).update(idList)
def modAminoacidsFromLabelInfo(labelDescriptor):
    """Returns a set of all amino acids and termini which can bear a label, as
    described in "labelDescriptor".

    :param labelDescriptor: :class:`LabelDescriptor` describes the label setup
        of an experiment

    :returns: set of all non-empty keys found in the 'aminoAcidLabels' entry
        of any label state of ``labelDescriptor.labels``
    """
    return {
        labelTarget
        for labelState in viewvalues(labelDescriptor.labels)
        for labelPosition in viewkeys(labelState['aminoAcidLabels'])
        for labelTarget in aux.toList(labelPosition)
        if labelTarget != ''
    }
def modSymbolsFromLabelInfo(labelDescriptor):
    """Returns a set of all modification symbols which were used in the
    labelDescriptor.

    :param labelDescriptor: :class:`LabelDescriptor` describes the label setup
        of an experiment

    :returns: set of all non-empty values found in the 'aminoAcidLabels' entry
        of any label state of ``labelDescriptor.labels``
    """
    return {
        symbol
        for labelState in viewvalues(labelDescriptor.labels)
        for labelSymbols in viewvalues(labelState['aminoAcidLabels'])
        for symbol in aux.toList(labelSymbols)
        if symbol != ''
    }
def modSymbolsFromLabelInfo(labelDescriptor):
    """Returns a set of all modification symbols which were used in the
    labelDescriptor.

    :param labelDescriptor: :class:`LabelDescriptor` describes the label setup
        of an experiment

    :returns: set of all non-empty values found in the 'aminoAcidLabels' entry
        of any label state of ``labelDescriptor.labels``
    """
    usedSymbols = set()
    for labelState in viewvalues(labelDescriptor.labels):
        for labelSymbols in viewvalues(labelState['aminoAcidLabels']):
            usedSymbols.update(
                symbol for symbol in aux.toList(labelSymbols) if symbol != ''
            )
    return usedSymbols
def getArrays(self, attr=None, sort=False, reverse=False, selector=None,
              defaultValue=None, report='lfq'):
    """Return the attributes of the selected items as a mapping of arrays.

    The attributes 'id' and 'intensities' are always requested. The
    'intensities' entry is afterwards split up into one ``numpy.float64``
    array per entry of ``self._matrixTemplate`` and removed from the result.

    :param attr: attribute name or list of attribute names that should be
        returned in addition to 'id' and the intensity columns
    :param sort: passed on to ``self.getItems()``
    :param reverse: passed on to ``self.getItems()``
    :param selector: function used to select items, passed on to
        ``self.getItems()``. By default only items with a true ``isValid``
        attribute are selected.
    :param defaultValue: passed on to ``_getArrays()``
    :param report: currently not used by this method

    :returns: the container returned by ``_getArrays()``, extended by one
        entry per entry of ``self._matrixTemplate``
    """
    if selector is None:
        selector = lambda fgi: fgi.isValid
    extraAttributes = attr if attr is not None else []
    attr = set(['id', 'intensities'] + aux.toList(extraAttributes))

    arrays = _getArrays(self.getItems(sort, reverse, selector), attr,
                        defaultValue)

    # One empty column per entry of the matrix template.
    for specfile in self._matrixTemplate:
        arrays[specfile] = []

    # Each item contributes one intensity value to every column.
    for itemIntensities in arrays['intensities']:
        for specfile, value in zip(self._matrixTemplate, itemIntensities):
            arrays[specfile].append(value)

    # Convert the collected columns into float arrays.
    for specfile in self._matrixTemplate:
        column = arrays[specfile]
        arrays[specfile] = numpy.array(column, dtype=numpy.float64)

    del arrays['intensities']
    return arrays
def test_toList(self):
    """Check ``MODULE.toList`` with a tuple, a string and an integer."""
    # Pairs of (argument, expected return value).
    expectations = [
        ((1, 2, 3, 'A'), (1, 2, 3, 'A')),
        ('A', ['A']),
        (123, [123]),
    ]
    for argument, expected in expectations:
        self.assertEqual(MODULE.toList(argument), expected)
def mappingBasedGrouping(protToPeps):
    """Performs protein grouping based only on protein to peptide mappings.

    The proteins are split into clusters by ``_findProteinClusters()`` and
    each cluster is processed independently: protein groups are created, the
    proteins are assigned to groups, the peptides are classified and one
    ``Protein`` entry is generated per protein id.

    :param protToPeps: dict, for each protein (=key) contains a set of
        associated peptides (=value). For Example {protein: {peptide, ...}, ...}

    :returns: a ``ProteinInference`` object

    #TODO: REFACTORING!!!
    """
    inference = ProteinInference(protToPeps)
    pepToProts = inference.pepToProts

    proteinClusters = _findProteinClusters(protToPeps, pepToProts)
    # NOTE(review): "proteins" is never read within this function.
    proteins = {}
    # Cluster ids are consecutive integers starting with 1.
    for clusterId, proteinCluster in enumerate(proteinClusters, 1):
        clusterProtToPeps = {p: protToPeps[p] for p in proteinCluster}

        #Find sameset proteins, define unique and non unique sameset proteins
        #NOTE: already unique proteins could be excluded to find sameset proteins
        samesetProteins = _findSamesetProteins(clusterProtToPeps)
        mergedProtToPeps = _mergeProteinEntries(samesetProteins,
                                                clusterProtToPeps)
        mergedPepToProts = _invertMapping(mergedProtToPeps)
        uniqueProteins = _findUniqueMappingValues(mergedPepToProts)
        remainingProteins = set(mergedProtToPeps).difference(uniqueProteins)

        # Remove subset proteins and check if remaining proteins become unique
        # subsetProteinInfo is iterated as (protein, supersetProteins) pairs
        # further below.
        subsetProteinInfo = _findSubsetProteins(remainingProteins,
                                                mergedProtToPeps,
                                                mergedPepToProts)
        subsetProteins = [p for p, _ in subsetProteinInfo]
        subsetRemovedProtToPeps = _reducedProtToPeps(mergedProtToPeps,
                                                     subsetProteins)
        subsetRemovedPepToProts = _invertMapping(subsetRemovedProtToPeps)
        uniqueSubsetRemoved = _findUniqueMappingValues(subsetRemovedPepToProts)
        remainingProteins = remainingProteins.difference(subsetProteins)
        remainingProteins = remainingProteins.difference(uniqueSubsetRemoved)

        # Find redundant proteins #
        subsumableProteins = _findRedundantProteins(subsetRemovedProtToPeps,
                                                    subsetRemovedPepToProts)
        remainingNonRedundant = remainingProteins.difference(
            subsumableProteins)
        # Every protein of this set starts its own protein group below.
        groupInitiatingProteins = uniqueSubsetRemoved.union(
            remainingNonRedundant)

        # - Generate protein groups and assign proteins to groups - #
        #Generate protein groups
        clusterGroupIds = set()
        for protein in groupInitiatingProteins:
            # The first protein id is used to create the group, all ids of the
            # entry are added as leading proteins.
            proteinIds = AUX.toList(protein)
            groupId = inference.addProteinGroup(proteinIds[0])
            inference.addLeadingToGroups(proteinIds, groupId)
            clusterGroupIds.add(groupId)

        #Add redundant proteins here (must be subsumable I guess)
        for protein in subsumableProteins:
            proteinIds = AUX.toList(protein)
            # Look up the groups of all proteins that are connected to this
            # protein via its peptides.
            connectedProteins = _mappingGetValueSet(mergedPepToProts,
                                                    mergedProtToPeps[protein])
            flatConnectedProteins = _flattenMergedProteins(connectedProteins)
            groupIds = _mappingGetValueSet(inference._proteinToGroupIds,
                                           flatConnectedProteins)
            inference.addSubsumableToGroups(proteinIds, groupIds)
            # NOTE(review): this sanity check is skipped when python is run
            # with the -O option.
            assert len(groupIds) > 1

        #Add subgroup proteins to the respective groups
        #NOTE: proteins that are only a subset of subsumable proteins are not
        #to be added as subset proteins to a group but as subsumable proteins.
        for protein, supersetProteins in subsetProteinInfo:
            proteinIds = AUX.toList(protein)

            #If the protein is a subset of at least one protein, that is not a
            #subsumable protein, then it should be added to the group as subset.
            leadingSuperProteins = supersetProteins.intersection(
                groupInitiatingProteins)
            if leadingSuperProteins:
                flatSupersetProteins = _flattenMergedProteins(
                    leadingSuperProteins)
                superGroupIds = _mappingGetValueSet(
                    inference._proteinToGroupIds, flatSupersetProteins)
                inference.addSubsetToGroups(proteinIds, superGroupIds)
            #However, if all its super proteins are subsumable, the protein
            #itself is a subsumable protein.
            else:
                flatSupersetProteins = _flattenMergedProteins(supersetProteins)
                superGroupIds = _mappingGetValueSet(
                    inference._proteinToGroupIds, flatSupersetProteins)
                inference.addSubsumableToGroups(proteinIds, superGroupIds)
                # NOTE(review): the individual ids of "proteinIds" are added
                # here, whereas the membership test "protein in
                # subsumableProteins" further below uses the key of
                # mergedProtToPeps. Confirm that both are identical for
                # merged (sameset) entries, otherwise isSubsumable is not set
                # for those entries.
                subsumableProteins.update(proteinIds)
            assert superGroupIds

        # - Define peptide properties - #
        groupToPeps = dict()
        allSubsumablePeps = set()
        for groupId in clusterGroupIds:
            group = inference.groups[groupId]
            if group.subsumableProteins:
                subsumablePeptides = _mappingGetValueSet(
                    protToPeps, group.subsumableProteins)
                allSubsumablePeps.update(subsumablePeptides)

            groupPeptides = _mappingGetValueSet(protToPeps, group.proteins)
            groupToPeps[groupId] = groupPeptides
        pepToGroups = _invertMapping(groupToPeps)

        #Get unique peptides from peptide to protein mapping
        uniquePeptides = _findUniqueMappingKeys(mergedPepToProts)
        #Shared peptides have a groupPeptideCount > 1
        nonSharedPeptides = _findUniqueMappingKeys(pepToGroups)
        sharedPeptides = set(pepToGroups).difference(nonSharedPeptides)
        #Subsumable peptides are peptides from subsumable proteins that
        #are not shared peptides of multiple groups
        subsumablePeptides = allSubsumablePeps.difference(sharedPeptides)
        #groupUniquePeptides are the remaining ones (not shared with subsumable
        #proteins, groupPeptideCount == 1, not unique peptides)
        groupUniquePeptides = nonSharedPeptides.difference(subsumablePeptides)
        groupUniquePeptides = groupUniquePeptides.difference(uniquePeptides)

        # The peptide classes of all clusters are accumulated in the
        # inference object.
        inference._uniquePeptides.update(uniquePeptides)
        inference._groupUniquePeptides.update(groupUniquePeptides)
        inference._groupSubsumablePeptides.update(subsumablePeptides)
        inference._sharedPeptides.update(sharedPeptides)

        # - Generate protein entries and add them to the inference object - #
        subsetProteinInfoDict = dict(subsetProteinInfo)
        for protein, peptides in viewitems(mergedProtToPeps):
            _uniquePeptides = peptides.intersection(uniquePeptides)
            _groupUniquePeptides = peptides.intersection(groupUniquePeptides)
            _subsumablePeptides = peptides.intersection(subsumablePeptides)
            _sharedPeptides = peptides.intersection(sharedPeptides)
            proteinIds = AUX.toList(protein)
            # One Protein entry is generated per protein id; all ids of the
            # same mapping key share the same peptide sets.
            for proteinId in proteinIds:
                proteinEntry = Protein(proteinId, peptides)
                if protein in groupInitiatingProteins:
                    proteinEntry.isLeading = True
                elif protein in subsumableProteins:
                    proteinEntry.isSubsumable = True
                if protein in subsetProteins:
                    superset = subsetProteinInfoDict[protein]
                    proteinEntry.isSubset = _flattenMergedProteins(superset)
                if len(proteinIds) > 1:
                    proteinEntry.isSameset = set(proteinIds)
                inference.proteins[proteinId] = proteinEntry

                #Add peptides to protein entry
                proteinEntry.uniquePeptides = _uniquePeptides
                proteinEntry.groupUniquePeptides = _groupUniquePeptides
                proteinEntry.groupSubsumablePeptides = _subsumablePeptides
                proteinEntry.sharedPeptides = _sharedPeptides

        # - Save cluster information - #
        for proteinId in proteinCluster:
            inference._proteinToClusterId[proteinId] = clusterId
        inference.clusters[clusterId] = clusterGroupIds

    # Sanity check: the number of distinct proteins that were assigned to
    # groups must be equal to the number of input proteins.
    allProteins = set()
    for proteinGroup in viewvalues(inference.groups):
        allProteins.update(proteinGroup.proteins)
        allProteins.update(proteinGroup.subsumableProteins)
    assert len(allProteins) == len(protToPeps)
    return inference
def mappingBasedGrouping(protToPeps):
    """Performs protein grouping based only on protein to peptide mappings.

    The proteins are split into clusters by ``_findProteinClusters()`` and
    each cluster is processed independently: protein groups are created, the
    proteins are assigned to groups, the peptides are classified and one
    ``Protein`` entry is generated per protein id.

    :param protToPeps: dict, for each protein (=key) contains a set of
        associated peptides (=value). For Example {protein: {peptide, ...}, ...}

    :returns: a ``ProteinInference`` object

    #TODO: REFACTORING!!!
    """
    inference = ProteinInference(protToPeps)
    pepToProts = inference.pepToProts

    proteinClusters = _findProteinClusters(protToPeps, pepToProts)
    # NOTE(review): "proteins" is never read within this function.
    proteins = {}
    # Cluster ids are consecutive integers starting with 1.
    for clusterId, proteinCluster in enumerate(proteinClusters, 1):
        clusterProtToPeps = {p: protToPeps[p] for p in proteinCluster}

        #Find sameset proteins, define unique and non unique sameset proteins
        #NOTE: already unique proteins could be excluded to find sameset proteins
        samesetProteins = _findSamesetProteins(clusterProtToPeps)
        mergedProtToPeps = _mergeProteinEntries(samesetProteins,
                                                clusterProtToPeps)
        mergedPepToProts = _invertMapping(mergedProtToPeps)
        uniqueProteins = _findUniqueMappingValues(mergedPepToProts)
        remainingProteins = set(mergedProtToPeps).difference(uniqueProteins)

        # Remove subset proteins and check if remaining proteins become unique
        # subsetProteinInfo is iterated as (protein, supersetProteins) pairs
        # further below.
        subsetProteinInfo = _findSubsetProteins(remainingProteins,
                                                mergedProtToPeps,
                                                mergedPepToProts)
        subsetProteins = [p for p, _ in subsetProteinInfo]
        subsetRemovedProtToPeps = _reducedProtToPeps(mergedProtToPeps,
                                                     subsetProteins)
        subsetRemovedPepToProts = _invertMapping(subsetRemovedProtToPeps)
        uniqueSubsetRemoved = _findUniqueMappingValues(subsetRemovedPepToProts)
        remainingProteins = remainingProteins.difference(subsetProteins)
        remainingProteins = remainingProteins.difference(uniqueSubsetRemoved)

        # Find redundant proteins #
        subsumableProteins = _findRedundantProteins(subsetRemovedProtToPeps,
                                                    subsetRemovedPepToProts)
        remainingNonRedundant = remainingProteins.difference(subsumableProteins)
        # Every protein of this set starts its own protein group below.
        groupInitiatingProteins = uniqueSubsetRemoved.union(remainingNonRedundant)

        # - Generate protein groups and assign proteins to groups - #
        #Generate protein groups
        clusterGroupIds = set()
        for protein in groupInitiatingProteins:
            # The first protein id is used to create the group, all ids of the
            # entry are added as leading proteins.
            proteinIds = AUX.toList(protein)
            groupId = inference.addProteinGroup(proteinIds[0])
            inference.addLeadingToGroups(proteinIds, groupId)
            clusterGroupIds.add(groupId)

        #Add redundant proteins here (must be subsumable I guess)
        for protein in subsumableProteins:
            proteinIds = AUX.toList(protein)
            # Look up the groups of all proteins that are connected to this
            # protein via its peptides.
            connectedProteins = _mappingGetValueSet(
                mergedPepToProts, mergedProtToPeps[protein]
            )
            flatConnectedProteins = _flattenMergedProteins(connectedProteins)
            groupIds = _mappingGetValueSet(
                inference._proteinToGroupIds, flatConnectedProteins
            )
            inference.addSubsumableToGroups(proteinIds, groupIds)
            # NOTE(review): this sanity check is skipped when python is run
            # with the -O option.
            assert len(groupIds) > 1

        #Add subgroup proteins to the respective groups
        #NOTE: proteins that are only a subset of subsumable proteins are not
        #to be added as subset proteins to a group but as subsumable proteins.
        for protein, supersetProteins in subsetProteinInfo:
            proteinIds = AUX.toList(protein)

            #If the protein is a subset of at least one protein, that is not a
            #subsumable protein, then it should be added to the group as subset.
            leadingSuperProteins = supersetProteins.intersection(
                groupInitiatingProteins)
            if leadingSuperProteins:
                flatSupersetProteins = _flattenMergedProteins(
                    leadingSuperProteins)
                superGroupIds = _mappingGetValueSet(
                    inference._proteinToGroupIds, flatSupersetProteins
                )
                inference.addSubsetToGroups(proteinIds, superGroupIds)
            #However, if all its super proteins are subsumable, the protein
            #itself is a subsumable protein.
            else:
                flatSupersetProteins = _flattenMergedProteins(supersetProteins)
                superGroupIds = _mappingGetValueSet(
                    inference._proteinToGroupIds, flatSupersetProteins
                )
                inference.addSubsumableToGroups(proteinIds, superGroupIds)
                # NOTE(review): the individual ids of "proteinIds" are added
                # here, whereas the membership test "protein in
                # subsumableProteins" further below uses the key of
                # mergedProtToPeps. Confirm that both are identical for
                # merged (sameset) entries, otherwise isSubsumable is not set
                # for those entries.
                subsumableProteins.update(proteinIds)
            assert superGroupIds

        # - Define peptide properties - #
        groupToPeps = dict()
        allSubsumablePeps = set()
        for groupId in clusterGroupIds:
            group = inference.groups[groupId]
            if group.subsumableProteins:
                subsumablePeptides = _mappingGetValueSet(
                    protToPeps, group.subsumableProteins
                )
                allSubsumablePeps.update(subsumablePeptides)

            groupPeptides = _mappingGetValueSet(protToPeps, group.proteins)
            groupToPeps[groupId] = groupPeptides
        pepToGroups = _invertMapping(groupToPeps)

        #Get unique peptides from peptide to protein mapping
        uniquePeptides = _findUniqueMappingKeys(mergedPepToProts)
        #Shared peptides have a groupPeptideCount > 1
        nonSharedPeptides = _findUniqueMappingKeys(pepToGroups)
        sharedPeptides = set(pepToGroups).difference(nonSharedPeptides)
        #Subsumable peptides are peptides from subsumable proteins that
        #are not shared peptides of multiple groups
        subsumablePeptides = allSubsumablePeps.difference(sharedPeptides)
        #groupUniquePeptides are the remaining ones (not shared with subsumable
        #proteins, groupPeptideCount == 1, not unique peptides)
        groupUniquePeptides = nonSharedPeptides.difference(subsumablePeptides)
        groupUniquePeptides = groupUniquePeptides.difference(uniquePeptides)

        # The peptide classes of all clusters are accumulated in the
        # inference object.
        inference._uniquePeptides.update(uniquePeptides)
        inference._groupUniquePeptides.update(groupUniquePeptides)
        inference._groupSubsumablePeptides.update(subsumablePeptides)
        inference._sharedPeptides.update(sharedPeptides)

        # - Generate protein entries and add them to the inference object - #
        subsetProteinInfoDict = dict(subsetProteinInfo)
        for protein, peptides in viewitems(mergedProtToPeps):
            _uniquePeptides = peptides.intersection(uniquePeptides)
            _groupUniquePeptides = peptides.intersection(groupUniquePeptides)
            _subsumablePeptides = peptides.intersection(subsumablePeptides)
            _sharedPeptides = peptides.intersection(sharedPeptides)
            proteinIds = AUX.toList(protein)
            # One Protein entry is generated per protein id; all ids of the
            # same mapping key share the same peptide sets.
            for proteinId in proteinIds:
                proteinEntry = Protein(proteinId, peptides)
                if protein in groupInitiatingProteins:
                    proteinEntry.isLeading = True
                elif protein in subsumableProteins:
                    proteinEntry.isSubsumable = True
                if protein in subsetProteins:
                    superset = subsetProteinInfoDict[protein]
                    proteinEntry.isSubset = _flattenMergedProteins(superset)
                if len(proteinIds) > 1:
                    proteinEntry.isSameset = set(proteinIds)
                inference.proteins[proteinId] = proteinEntry

                #Add peptides to protein entry
                proteinEntry.uniquePeptides = _uniquePeptides
                proteinEntry.groupUniquePeptides = _groupUniquePeptides
                proteinEntry.groupSubsumablePeptides = _subsumablePeptides
                proteinEntry.sharedPeptides = _sharedPeptides

        # - Save cluster information - #
        for proteinId in proteinCluster:
            inference._proteinToClusterId[proteinId] = clusterId
        inference.clusters[clusterId] = clusterGroupIds

    # Sanity check: the number of distinct proteins that were assigned to
    # groups must be equal to the number of input proteins.
    allProteins = set()
    for proteinGroup in viewvalues(inference.groups):
        allProteins.update(proteinGroup.proteins)
        allProteins.update(proteinGroup.subsumableProteins)
    assert len(allProteins) == len(protToPeps)
    return inference
def expectedLabelPosition(peptide, labelStateInfo, sequence=None,
                          modPositions=None):
    """Returns the label modifications expected for one label state of a
    peptide.

    :param peptide: Peptide sequence used to calculate the expected label
        state modifications
    :param labelStateInfo: An entry of :attr:`LabelDescriptor.labels` that
        describes a label state. The keys 'aminoAcidLabels' and
        'excludingModifications' are read.
    :param sequence: unmodified amino acid sequence of :var:`peptide`, if None
        it is generated by :func:`maspy.peptidemethods.removeModifications()`
    :param modPositions: dictionary describing the modification state of
        "peptide", if None it is generated by
        :func:`maspy.peptidemethods.returnModPositions()`

    :returns: {sequence position: sorted list of expected label modifications
        on that position, ...}
    """
    if modPositions is None:
        modPositions = maspy.peptidemethods.returnModPositions(peptide,
                                                               indexStart=0)
    if sequence is None:
        sequence = maspy.peptidemethods.removeModifications(peptide)

    # Step 1: collect the label symbols expected at each sequence position.
    labelsByPosition = dict()
    for labelTarget, labelSymbols in viewitems(
            labelStateInfo['aminoAcidLabels']):
        labelSymbols = aux.toList(labelSymbols)
        if labelSymbols == ['']:
            # An empty symbol means that the target bears no label.
            continue
        if labelTarget == 'nTerm':
            # N-terminal labels are stored at position 0.
            targetPositions = [0]
        else:
            targetPositions = aux.findAllSubstrings(sequence, labelTarget)
        for targetPosition in targetPositions:
            labelsByPosition.setdefault(targetPosition, list())
            labelsByPosition[targetPosition].extend(labelSymbols)

    # Step 2: a position that carries an excluding modification cannot bear
    # the respective label symbol.
    exclusions = labelStateInfo['excludingModifications']
    if exclusions is not None:
        for excludingMod, excludedSymbol in viewitems(exclusions):
            if excludingMod not in modPositions:
                continue
            for modPosition in modPositions[excludingMod]:
                labelsAtPosition = labelsByPosition.get(modPosition)
                if (labelsAtPosition is None
                        or excludedSymbol not in labelsAtPosition):
                    continue
                if len(labelsAtPosition) == 1:
                    del labelsByPosition[modPosition]
                else:
                    # Removes only the first occurrence of the symbol.
                    labelsAtPosition.remove(excludedSymbol)

    # Step 3: report the symbols of each position in sorted order.
    for _, labelsAtPosition in viewitems(labelsByPosition):
        labelsAtPosition.sort()
    return labelsByPosition
def matchToFeatures(fiContainer, specContainer, specfiles=None, fMassKey='mz',
                    sMassKey='obsMz', isotopeErrorList=(0),
                    precursorTolerance=5, toleranceUnit='ppm',
                    rtExpansionUp=0.10, rtExpansionDown=0.05, matchCharge=True,
                    scoreKey='pep', largerBetter=False):
    """Annotate :class:`Fi <maspy.core.Fi>` (Feature items) by matching
    :class:`Si <maspy.core.Si>` (Spectrum items) or :class:`Sii
    <maspy.core.Sii>` (Spectrum identification items).

    Matching statistics are printed for each processed specfile.

    :param fiContainer: :class:`maspy.core.FeatureContainer`, contains ``Fi``.
    :param specContainer: :class:`maspy.core.MsrunContainer` or
        :class:`maspy.core.SiiContainer`, contains ``Si`` or ``Sii``.
    :param specfiles: filenames of ms-run files, if specified consider only
        items from those files. Only specfiles that are present in the
        ``specContainer`` are processed.
    :type specfiles: str, list or None
    :param fMassKey: mass attribute key in :attr:`Fi.__dict__`
    :param sMassKey: mass attribute key in :attr:`Si.__dict__` or
        :attr:`Sii.__dict__` (eg 'obsMz', 'excMz')
    :param isotopeErrorList: allowed isotope errors relative to the spectrum
        mass, for example "0" or "1". If no feature has been matched with
        isotope error 0, the spectrum mass is increased by the mass difference
        of carbon isotopes 12 and 13 and matched again. The different isotope
        error values are tested in the specified order therefore "0" should
        normally be the first value of the list.
    :type isotopeErrorList: list or tuple of int
    :param precursorTolerance: the largest allowed mass deviation of ``Si`` or
        ``Sii`` relative to ``Fi``
    :param toleranceUnit: defines how the ``precursorTolerance`` is applied to
        the mass value of ``Fi``. ``"ppm": mass * (1 +/- tolerance*1E-6)`` or
        ``"da": mass +/- value``. The value is not case sensitive.
    :param rtExpansionUp: relative upper expansion of ``Fi`` retention time
        area. ``limitHigh = Fi.rtHigh + (Fi.rtHigh - Fi.rtLow) * rtExpansionUp``
    :param rtExpansionDown: relative lower expansion of ``Fi`` retention time
        area. ``limitLow = Fi.rtLow - (Fi.rtHigh - Fi.rtLow) * rtExpansionDown``
    :param matchCharge: bool, True if ``Fi`` and ``Si`` or ``Sii`` must have the
        same ``charge`` state to be matched.
    :param scoreKey: ``Sii`` attribute name used for scoring the identification
        reliability
    :param largerBetter: bool, True if higher score value means a better
        identification reliability

    :raises ValueError: if ``toleranceUnit`` is neither "ppm" nor "da"

    .. note:
        Concerning the feature retention area expansion. If ``Si`` or ``Sii`` is
        matched to multiple ``Fi`` the rt expansion is removed and the matching
        is repeated.

    .. note:
        If the ``specContainer`` is a ``SiiContainer`` then matched ``Fi`` are
        annotated with :attr:`Sii.peptide`, if multiple ``Sii`` are matched to
        ``Fi`` the one with the best score is used.

    #TODO: this function is nested pretty badly and should maybe be rewritten
    #TODO: replace tolerance unit "ppm" by tolerance mode "relative" and change
        respective calculations
    """
    # Reject unsupported units up front. Otherwise the mass limits would be
    # undefined for the first spectrum or silently reused from the previous
    # spectrum.
    toleranceMode = toleranceUnit.lower()
    if toleranceMode not in ('ppm', 'da'):
        raise ValueError('toleranceUnit must be "ppm" or "da", not %r'
                         % (toleranceUnit, ))

    isotopeErrorList = aux.toList(isotopeErrorList)

    # Name of the Fi attribute to which matched spectrum ids are appended.
    if specContainer.__class__.__name__ == 'MsrunContainer':
        listKeySpecIds = 'siIds'
    else:
        listKeySpecIds = 'siiIds'
    specContainerSpecfiles = [_ for _ in viewkeys(specContainer.info)]

    if specfiles is not None:
        specfiles = aux.toList(specfiles)
    else:
        specfiles = [_ for _ in viewkeys(fiContainer.info)]
    specfiles = list(set(specfiles).intersection(set(specContainerSpecfiles)))

    for specfile in specfiles:
        multiMatchCounter = int()
        isotopeErrorMatchCounter = int()
        specArrays = specContainer.getArrays(
            [sMassKey, 'rt', 'charge', 'msLevel'], specfiles=specfile
        )
        # Features are sorted by mass so that the mass window of a spectrum
        # can be located by bisection.
        featureArrays = fiContainer.getArrays(
            ['rtHigh', 'rtLow', 'charge', fMassKey], specfiles=specfile,
            sort=fMassKey
        )
        featureArrays['rtHighExpanded'] = (
            featureArrays['rtHigh'] +
            (featureArrays['rtHigh'] - featureArrays['rtLow']) * rtExpansionUp
        )
        featureArrays['rtLowExpanded'] = (
            featureArrays['rtLow'] -
            (featureArrays['rtHigh'] - featureArrays['rtLow']) * rtExpansionDown
        )

        specFeatureDict = dict() ## key = specId, value = matched featureId
        featureSpecDict = dict() ## key = featureId, value = last matched specId

        for specPos, specId in enumerate(specArrays['id']):
            specZ = specArrays['charge'][specPos]
            if specZ is None:
                continue

            specMass = specArrays[sMassKey][specPos]
            specRt = specArrays['rt'][specPos]

            matchComplete = False
            isotopeErrorPos = 0

            while not matchComplete:
                isotopeError = isotopeErrorList[isotopeErrorPos]

                # calculate mass limits for each isotope error
                if toleranceMode == 'ppm':
                    specMassHigh = ((specMass + isotopeError * 1.003355 / specZ)
                                    * (1 + precursorTolerance*1E-6)
                                    )
                    specMassLow = ((specMass + isotopeError * 1.003355 / specZ)
                                   * (1 - precursorTolerance*1E-6)
                                   )
                else:
                    # toleranceMode == 'da'
                    specMassHigh = ((specMass + isotopeError * 1.003355 / specZ)
                                    + precursorTolerance
                                    )
                    specMassLow = ((specMass + isotopeError * 1.003355 / specZ)
                                   - precursorTolerance
                                   )

                posL = bisect.bisect_left(featureArrays[fMassKey],
                                          specMassLow
                                          )
                posR = bisect.bisect_right(featureArrays[fMassKey],
                                           specMassHigh
                                           )

                if matchCharge:
                    chargeMask = (featureArrays['charge'][posL:posR] == specZ)

                # First try with the expanded retention time area; if that
                # yields more than one feature, repeat with the original area.
                for fRtHighKey, fRtLowKey in [('rtHighExpanded',
                                               'rtLowExpanded'),
                                              ('rtHigh', 'rtLow')
                                              ]:
                    rtMask = ((featureArrays[fRtLowKey][posL:posR] <= specRt) &
                              (featureArrays[fRtHighKey][posL:posR] >= specRt)
                              )
                    if matchCharge:
                        matchedFeatureIds = featureArrays['id'][posL:posR][rtMask & chargeMask]
                    else:
                        matchedFeatureIds = featureArrays['id'][posL:posR][rtMask]

                    if len(matchedFeatureIds) <= 1:
                        break

                # The match is only accepted if exactly one feature has been
                # matched.
                if len(matchedFeatureIds) > 0:
                    if len(matchedFeatureIds) == 1:
                        matchComplete = True
                        if isotopeErrorList[isotopeErrorPos] != 0:
                            isotopeErrorMatchCounter += 1
                    else:
                        #Stop if Spectrum can be matched to multiple features
                        multiMatchCounter += 1
                        break

                isotopeErrorPos += 1
                if isotopeErrorPos >= len(isotopeErrorList):
                    #Stop if all allowed isotope errors have been tested
                    break

            if matchComplete:
                for featureId in matchedFeatureIds:
                    featureItem = fiContainer.container[specfile][featureId]
                    getattr(featureItem, listKeySpecIds).append(specId)
                    featureItem.isMatched = True
                    specFeatureDict[specId] = featureId
                    featureSpecDict[featureId] = specId

        stats = dict()
        stats['totalFeatures'] = len(featureArrays['id'])
        stats['matchedFeatures'] = len(featureSpecDict)
        stats['totalSpectra'] = len(specArrays['id'][(specArrays['msLevel'] != 1)])
        stats['matchedSpectra'] = len(specFeatureDict)
        # A float factor enforces true division on Python 2, an empty specfile
        # is reported as 0.0 % instead of raising a ZeroDivisionError.
        if stats['totalFeatures']:
            stats['relMatchedFeatures'] = round(
                100.0 * stats['matchedFeatures'] / stats['totalFeatures'], 1
            )
        else:
            stats['relMatchedFeatures'] = 0.0
        if stats['totalSpectra']:
            stats['relMatchedSpectra'] = round(
                100.0 * stats['matchedSpectra'] / stats['totalSpectra'], 1
            )
        else:
            stats['relMatchedSpectra'] = 0.0

        print('------', specfile, '------')
        print('Annotated features:\t\t\t', stats['matchedFeatures'], '/',
              stats['totalFeatures'], '=', stats['relMatchedFeatures'], '%')
        print('Spectra matched to features:\t\t', stats['matchedSpectra'],
              '/', stats['totalSpectra'], '=', stats['relMatchedSpectra'], '%')
        if multiMatchCounter != 0:
            print('Discarded because of multiple matches:\t',
                  multiMatchCounter)
        if isotopeErrorMatchCounter != 0:
            print('Isotope error matched spectra:\t\t',
                  isotopeErrorMatchCounter)

        #annotate feature with sii information (peptide, sequence, score)
        if isinstance(specContainer, maspy.core.SiiContainer):
            for featureId in viewkeys(featureSpecDict):
                featureItem = fiContainer.container[specfile][featureId]
                matches = list()
                for specId in featureItem.siiIds:
                    _sii = specContainer.getValidItem(specfile, specId)
                    score = getattr(_sii, scoreKey)
                    peptide = _sii.peptide
                    sequence = _sii.sequence
                    matches.append([score, peptide, sequence])
                # The best scoring identification is the first entry.
                matches.sort(reverse=largerBetter)

                featureItem.isAnnotated = True
                featureItem.score = matches[0][0]
                featureItem.peptide = matches[0][1]
                featureItem.sequence = matches[0][2]
def _precursorMassLimits(specMass, specZ, isotopeError, precursorTolerance,
                         toleranceUnit):
    """Return the ``(low, high)`` mass window of a spectrum for one isotope
    error.

    :param specMass: observed mass value of the spectrum item
    :param specZ: charge state of the spectrum item
    :param isotopeError: number of isotope steps added to ``specMass``
    :param precursorTolerance: allowed mass deviation
    :param toleranceUnit: lower case ``"ppm"`` (relative tolerance); any other
        value is treated as ``"da"`` (absolute tolerance). The caller is
        responsible for validating the unit.
    """
    shiftedMass = specMass + isotopeError * 1.003355 / specZ
    if toleranceUnit == 'ppm':
        return (shiftedMass * (1 - precursorTolerance * 1E-6),
                shiftedMass * (1 + precursorTolerance * 1E-6))
    return (shiftedMass - precursorTolerance,
            shiftedMass + precursorTolerance)


def _relativePercentage(part, total):
    """Return ``part`` as percentage of ``total`` rounded to one decimal, or
    ``0.0`` if ``total`` is zero (instead of raising ZeroDivisionError).
    """
    if total == 0:
        return 0.0
    return round(100 * part / total, 1)


def matchToFeatures(fiContainer, specContainer, specfiles=None, fMassKey='mz',
                    sMassKey='obsMz', isotopeErrorList=(0),
                    precursorTolerance=5, toleranceUnit='ppm',
                    rtExpansionUp=0.10, rtExpansionDown=0.05, matchCharge=True,
                    scoreKey='pep', largerBetter=False):
    """Annotate :class:`Fi <maspy.core.Fi>` (Feature items) by matching
    :class:`Si <maspy.core.Si>` (Spectrum items) or :class:`Sii
    <maspy.core.Sii>` (Spectrum identification items).

    Matched feature items are modified in place (``isMatched``, the list of
    matched spectrum ids and, for a ``SiiContainer``, ``isAnnotated``,
    ``score``, ``peptide`` and ``sequence``). A short matching summary is
    printed for every processed specfile. The function returns ``None``.

    :param fiContainer: :class:`maspy.core.FeatureContainer`, contains ``Fi``.
    :param specContainer: :class:`maspy.core.MsrunContainer` or
        :class:`maspy.core.SiiContainer`, contains ``Si`` or ``Sii``.
    :param specfiles: filenames of ms-run files, if specified consider only
        items from those files. Only specfiles that are present in both
        containers are processed.
    :type specfiles: str, list or None
    :param fMassKey: mass attribute key in :attr:`Fi.__dict__`
    :param sMassKey: mass attribute key in :attr:`Si.__dict__` or
        :attr:`Sii.__dict__` (eg 'obsMz', 'excMz')
    :param isotopeErrorList: allowed isotope errors relative to the spectrum
        mass, for example "0" or "1". If no feature has been matched with
        isotope error 0, the spectrum mass is increased by the mass difference
        of carbon isotopes 12 and 13 and matched again. The different isotope
        error values are tested in the specified order therefore "0" should
        normally be the first value of the list.
    :type isotopeErrorList: int, list or tuple of int
    :param precursorTolerance: the largest allowed mass deviation of ``Si`` or
        ``Sii`` relative to ``Fi``
    :param toleranceUnit: defines how the ``precursorTolerance`` is applied to
        the (isotope error shifted) mass value of ``Si`` or ``Sii``, not case
        sensitive. ``"ppm": mass * (1 +/- tolerance*1E-6)`` or
        ``"da": mass +/- value``
    :param rtExpansionUp: relative upper expansion of ``Fi`` retention time
        area. ``limitHigh = Fi.rtHigh + (Fi.rtHigh - Fi.rtLow) * rtExpansionUp``
    :param rtExpansionDown: relative lower expansion of ``Fi`` retention time
        area. ``limitLow = Fi.rtLow - (Fi.rtHigh - Fi.rtLow) * rtExpansionDown``
    :param matchCharge: bool, True if ``Fi`` and ``Si`` or ``Sii`` must have the
        same ``charge`` state to be matched.
    :param scoreKey: ``Sii`` attribute name used for scoring the identification
        reliability
    :param largerBetter: bool, True if higher score value means a better
        identification reliability

    :raises ValueError: if ``toleranceUnit`` is neither "ppm" nor "da"

    .. note::
        Concerning the feature retention area expansion. If ``Si`` or ``Sii``
        is matched to multiple ``Fi`` the rt expansion is removed and the
        matching is repeated.

    .. note::
        If the ``specContainer`` is a ``SiiContainer`` then matched ``Fi`` are
        annotated with :attr:`Sii.peptide`, if multiple ``Sii`` are matched to
        ``Fi`` the one with the best score is used.

    #TODO: replace tolerance unit "ppm" by tolerance mode "relative" and change
        repsective calculations
    """
    # Validate the unit first; otherwise an unsupported value would leave the
    # mass limits undefined inside the matching loop.
    toleranceMode = toleranceUnit.lower()
    if toleranceMode not in ('ppm', 'da'):
        raise ValueError('toleranceUnit must be "ppm" or "da", got %r'
                         % (toleranceUnit, ))

    isotopeErrorList = aux.toList(isotopeErrorList)

    if specContainer.__class__.__name__ == 'MsrunContainer':
        listKeySpecIds = 'siIds'
    else:
        listKeySpecIds = 'siiIds'

    specContainerSpecfiles = list(viewkeys(specContainer.info))
    if specfiles is not None:
        specfiles = aux.toList(specfiles)
    else:
        specfiles = list(viewkeys(fiContainer.info))
    specfiles = list(set(specfiles).intersection(set(specContainerSpecfiles)))

    # Retention time windows are tried in this order: first the expanded
    # feature area and, if that is ambiguous, the unexpanded one.
    rtWindowKeys = (('rtHighExpanded', 'rtLowExpanded'), ('rtHigh', 'rtLow'))

    for specfile in specfiles:
        multiMatchCounter = 0
        isotopeErrorMatchCounter = 0
        featureItems = fiContainer.container[specfile]

        specArrays = specContainer.getArrays(
            [sMassKey, 'rt', 'charge', 'msLevel'], specfiles=specfile)
        # Features are sorted by mass so that bisect can be used below.
        featureArrays = fiContainer.getArrays(
            ['rtHigh', 'rtLow', 'charge', fMassKey], specfiles=specfile,
            sort=fMassKey)
        rtWidth = featureArrays['rtHigh'] - featureArrays['rtLow']
        featureArrays['rtHighExpanded'] = (
            featureArrays['rtHigh'] + rtWidth * rtExpansionUp)
        featureArrays['rtLowExpanded'] = (
            featureArrays['rtLow'] - rtWidth * rtExpansionDown)

        featureMasses = featureArrays[fMassKey]
        featureIds = featureArrays['id']
        featureCharges = featureArrays['charge']

        specFeatureDict = dict()  ## key = spectrum id, value = feature id
        featureSpecDict = dict()  ## key = feature id, value = spectrum id

        for specPos, specId in enumerate(specArrays['id']):
            specZ = specArrays['charge'][specPos]
            if specZ is None:
                continue
            specMass = specArrays[sMassKey][specPos]
            specRt = specArrays['rt'][specPos]

            matchComplete = False
            matchedFeatureIds = ()
            for isotopeError in isotopeErrorList:
                specMassLow, specMassHigh = _precursorMassLimits(
                    specMass, specZ, isotopeError, precursorTolerance,
                    toleranceMode)
                posL = bisect.bisect_left(featureMasses, specMassLow)
                posR = bisect.bisect_right(featureMasses, specMassHigh)

                candidateIds = featureIds[posL:posR]
                if matchCharge:
                    chargeMask = (featureCharges[posL:posR] == specZ)

                for fRtHighKey, fRtLowKey in rtWindowKeys:
                    rtMask = (
                        (featureArrays[fRtLowKey][posL:posR] <= specRt) &
                        (featureArrays[fRtHighKey][posL:posR] >= specRt))
                    if matchCharge:
                        matchedFeatureIds = candidateIds[rtMask & chargeMask]
                    else:
                        matchedFeatureIds = candidateIds[rtMask]
                    if len(matchedFeatureIds) <= 1:
                        break

                numMatches = len(matchedFeatureIds)
                if numMatches == 1:
                    matchComplete = True
                    if isotopeError != 0:
                        isotopeErrorMatchCounter += 1
                    break
                if numMatches > 1:
                    #Stop if Spectrum can be matched to multiple features
                    multiMatchCounter += 1
                    break
                # No feature matched, continue with the next isotope error.

            if matchComplete:
                for featureId in matchedFeatureIds:
                    featureItem = featureItems[featureId]
                    getattr(featureItem, listKeySpecIds).append(specId)
                    featureItem.isMatched = True
                    specFeatureDict[specId] = featureId
                    featureSpecDict[featureId] = specId

        stats = dict()
        stats['totalFeatures'] = len(featureIds)
        stats['matchedFeatures'] = len(featureSpecDict)
        stats['relMatchedFeatures'] = _relativePercentage(
            stats['matchedFeatures'], stats['totalFeatures'])
        stats['totalSpectra'] = len(
            specArrays['id'][(specArrays['msLevel'] != 1)])
        stats['matchedSpectra'] = len(specFeatureDict)
        stats['relMatchedSpectra'] = _relativePercentage(
            stats['matchedSpectra'], stats['totalSpectra'])

        print('------', specfile, '------')
        print('Annotated features:\t\t\t', stats['matchedFeatures'], '/',
              stats['totalFeatures'], '=', stats['relMatchedFeatures'], '%')
        print('Spectra matched to features:\t\t', stats['matchedSpectra'],
              '/', stats['totalSpectra'], '=', stats['relMatchedSpectra'],
              '%')
        if multiMatchCounter != 0:
            print('Discarded because of multiple matches:\t',
                  multiMatchCounter)
        if isotopeErrorMatchCounter != 0:
            print('Isotope error matched spectra:\t\t',
                  isotopeErrorMatchCounter)

        #annotate feature with sii information (peptide, sequence, score)
        if isinstance(specContainer, maspy.core.SiiContainer):
            for featureId in viewkeys(featureSpecDict):
                featureItem = featureItems[featureId]
                matches = list()
                for specId in featureItem.siiIds:
                    _sii = specContainer.getValidItem(specfile, specId)
                    matches.append([getattr(_sii, scoreKey), _sii.peptide,
                                    _sii.sequence])
                # Best scoring identification is sorted to the first position.
                matches.sort(reverse=largerBetter)

                featureItem.isAnnotated = True
                featureItem.score = matches[0][0]
                featureItem.peptide = matches[0][1]
                featureItem.sequence = matches[0][2]