def __init__(self, extract_file=True):
    """
    Downloads the LipidMaps data and initializes SDF reading on it.

    The download URL is read from the ``lipidmaps_url`` setting and the
    entry of the download result to read is named by the
    ``lipidmaps_fname`` setting.

    Args
    ----
    :param bool extract_file:
        If True, the selected entry of the download result is first
        written into the directory given by the ``cachedir`` setting
        (under the last path component of ``lipidmaps_fname``) and the
        SDF reader is initialized on that file. Otherwise the reader
        receives the entry of the download result directly.
    """

    self.url = settings.get('lipidmaps_url')
    self.fname = settings.get('lipidmaps_fname')
    self.curl = _curl.Curl(self.url, large=True, silent=False)

    if extract_file:

        self.efname = os.path.join(
            settings.get('cachedir'),
            self.fname.split('/')[-1],
        )

        with open(self.efname, 'wb') as cache_fp:

            for chunk in self.curl.result[self.fname]:

                cache_fp.write(chunk)

        # Reopen exactly the file written above instead of rebuilding
        # its path. The handle is not closed here because it is handed
        # over to the SDF reader.
        sdf_source = open(self.efname, 'rb')

    else:

        sdf_source = self.curl.result[self.fname]

    sdf.SdfReader.__init__(self, sdf_source)

    self.nameproc = lipidname.LipidNameProcessor(
        database='lipidmaps',
        iso=True,
    )
def __init__(
    self,
    fname,
    label=None,
    charge=1,
    rt_tolerance=None,
    drift=1.0,
    tolerance=None,
):
    """
    Sets up the lookup of MS2 scans from an MGF file.

    Args
    ----
    :param str fname:
        Path to the MGF file.
    :param label:
        Stored on the instance as ``label``.
    :param int charge:
        Stored on the instance as ``charge``.
    :param float rt_tolerance:
        Retention time tolerance; falls back to the
        ``deltart_threshold`` setting when not provided.
    :param float drift:
        Stored on the instance as ``drift``.
    :param float tolerance:
        Precursor mass tolerance (reported as ppm in the log message);
        falls back to the ``precursor_match_tolerance`` setting when
        not provided.
    """

    session.Logger.__init__(self, name='mgf')

    if not rt_tolerance:

        rt_tolerance = settings.get('deltart_threshold')

    self.fname = fname
    self.label = label
    self.charge = charge
    self.rt_tolerance = rt_tolerance
    self.drift = drift

    # the file is indexed before the remaining options are resolved
    self.index()

    self.ms2_rt_within_range = settings.get('ms2_rt_within_range')

    if not tolerance:

        tolerance = settings.get('precursor_match_tolerance')

    self.tolerance = tolerance

    # part of the log message describing how RT is considered
    if self.ms2_rt_within_range:

        rt_note = ' in RT range +/-%.02f' % self.rt_tolerance

    else:

        rt_note = 'with ignoring RT'

    message = (
        'MGF reader initialized for file `%s`, '
        'looking up MS2 spectra for precursor features %s, '
        'with a mass tolerance of %.01f ppm.'
    )

    self._log(message % (self.fname, rt_note, self.tolerance))
def test_sec_profile_1(self):
    """
    A SEC profile built with default arguments attaches to a sample set.

    Uses the example files named by the ``peaks_gltpd1_invitro`` and
    ``sec_gltpd1_invitro`` settings. Checks that the profile has the
    same number of samples and the same sample IDs as the sample set,
    that the two objects are registered in each other's
    ``_sample_data`` and that the maximum of the profile matches the
    reference value.
    """

    peakspath = settings.get('peaks_gltpd1_invitro')
    secpath = settings.get('sec_gltpd1_invitro')

    reader = sample.SampleReader(
        input_type='peaks',
        fname=peakspath,
    )

    samples = reader.get_sampleset(
        sample_id_proc=sampleattrs.plate_sample_id_processor(),
    )

    secprofile = sampleattrs.SECProfile(
        sec_path=secpath,
        samples=samples,
    )

    assert secprofile.numof_samples == samples.numof_samples
    assert np.all(
        secprofile.attrs.sample_index_to_id ==
        samples.attrs.sample_index_to_id
    )
    assert id(samples) in secprofile._sample_data
    assert id(secprofile) in samples._sample_data
    # abs() is essential: without it any value below the reference
    # (even a grossly wrong one) would satisfy the inequality
    assert abs(secprofile.profile.max() - 143.40397368421048) < 0.0001
def test_sec_profile_2(self):
    """
    A SEC profile built with two offsets attaches to a sample set.

    Uses the example files named by the ``peaks_gltpd1_invivo`` and
    ``sec_gltpd1_invivo`` settings. The profile is created with
    offsets 0.015 and 0.045, which are expected to result in the
    ``profile015`` and ``profile045`` attributes. Checks the sample
    bookkeeping between the two objects, the maximum of ``profile015``
    and the positions of the maxima of both profiles.
    """

    peakspath = settings.get('peaks_gltpd1_invivo')
    secpath = settings.get('sec_gltpd1_invivo')

    reader = sample.SampleReader(
        input_type='peaks',
        fname=peakspath,
    )

    samples = reader.get_sampleset(
        sample_id_proc=sampleattrs.plate_sample_id_processor(),
    )

    secprofile = sampleattrs.SECProfile(
        sec_path=secpath,
        samples=samples,
        start_volume=1.2,
        offsets=(0.015, 0.045),
        start_col=9,
        start_row='A',
        length=samples.numof_samples,
    )

    assert secprofile.numof_samples == samples.numof_samples
    assert np.all(
        secprofile.attrs.sample_index_to_id ==
        samples.attrs.sample_index_to_id
    )
    assert id(samples) in secprofile._sample_data
    assert id(secprofile) in samples._sample_data
    # abs() is essential: without it any value below the reference
    # (even a grossly wrong one) would satisfy the inequality
    assert abs(secprofile.profile015.max() - 20.92373913043478) < 0.0001
    assert secprofile.profile045.argmax() == 2
    assert secprofile.profile015.argmax() == 3
    assert secprofile.profiles == {'profile045', 'profile015'}
def __init__(
    self,
    resources=None,
    tolerance=None,
    fa_args=None,
    sph_args=None,
    build=True,
    verbose=False,
    database_preference=None,
):
    """
    Builds a database of molecules and provides methods for look up
    by masses and names. Metabolites are processed from databases like
    SwissLipids and LipidMaps and also autogenerated using classes
    defined in the `lipid` module.

    Args
    ----
    resources : dict
        Databases to use with arguments. Keys are database names,
        values are tuples of classes and arguments. By default
        SwissLipids and LipidMaps, both with empty arguments.
    tolerance : int
        Mass lookup tolerance in ppm. Falls back to the
        ``ms1_tolerance`` setting.
    fa_args : dict
        Fatty acyl arguments for autogenerated metabolites.
        Default is ``{'c': (4, 36), 'u': (0, 10)}``.
    sph_args : dict
        Sphingosine base arguments for autogenerated metabolites.
        Default is ``{'c': (16, 22), 'u': (0, 1)}``.
    build : bool
        Call ``build()`` at the end of the initialization.
    verbose : bool
        Stored on the instance as ``verbose``.
    database_preference :
        Stored on the instance; falls back to the
        ``database_preference`` setting.
    """

    self.verbose = verbose

    if not resources:

        resources = {
            'SwissLipids': (SwissLipids, {}),
            'LipidMaps': (LipidMaps, {}),
        }

    self.resources = resources

    if not tolerance:

        tolerance = settings.get('ms1_tolerance')

    self.tolerance = tolerance
    self._daltons_tolerance = False

    self.fa_args = fa_args if fa_args else {'c': (4, 36), 'u': (0, 10)}
    self.sph_args = sph_args if sph_args else {'c': (16, 22), 'u': (0, 1)}

    if not database_preference:

        database_preference = settings.get('database_preference')

    self.database_preference = database_preference

    if not build:

        return

    self.build()
def __init__(
    self,
    mzs,
    ionmode,
    precursor=None,
    tolerance=None,
):
    """
    Annotates all fragments in an MS2 scan with possible identities.

    Args
    ----
    :param np.ndarray mzs:
        MS2 scan fragment m/z's.
    :param str ionmode:
        MS ion mode; `pos` or `neg`.
    :param float precursor:
        Precursor ion m/z.
    :param tolerance:
        Stored on the instance as ``tolerance``; falls back to the
        ``ms2_tolerance`` setting when not provided.
    """

    if not tolerance:

        tolerance = settings.get('ms2_tolerance')

    self.mzs, self.ionmode = mzs, ionmode
    self.precursor = precursor
    self.tolerance = tolerance
def get_default_file(self):
    """
    Tells the file name of the default fragment list for the ion mode.

    The names are stored in the `pfragmentsfile` and `nfragmentsfile`
    settings for positive and negative ion modes, respectively. Any
    ``ionmode`` other than `pos` selects the negative mode setting.

    The fragment list files should have the following columns:

    * m/z as float
    * formula -- either formula or m/z should be provided, mass
      calculation from formula has priority over the mass in the
      first column
    * human readable name
    * type: e.g. `[M+H]+`; importantly, for neutral losses this value
      must start with `NL`
    * headgroups (lipid classes), e.g. `PC;SM`

    See the built in fragment lists for examples.

    Returns
    -------
    The value of the selected setting.
    """

    if self.ionmode == 'pos':

        mode_prefix = 'p'

    else:

        mode_prefix = 'n'

    setting_key = '%sfragmentsfile' % mode_prefix

    return settings.get(setting_key)
def mz_lowest_error_from_name(
    self,
    measured_mz,
    adduct,
    name=None,
    **kwargs
):
    """
    Regarding a measured m/z and an assumed adduct type and name
    returns the m/z of the record with matching name and lowest error.

    Parameters
    ----------
    measured_mz : float
        The measured m/z the candidates are compared to.
    adduct : str
        Adduct type; a key of the ``ex2ad_all`` setting which maps it
        to the name of the ``formula.Formula`` method converting an
        exact mass to the adduct m/z.
    name :
        Passed to ``masses_from_name``.
    **kwargs
        Passed to ``masses_from_name``.

    Returns
    -------
    The adduct m/z with the lowest absolute ppm difference from
    ``measured_mz``, or ``None`` if ``masses_from_name`` returned
    ``None`` or no masses at all.
    """

    exmasses = self.masses_from_name(name=name, **kwargs)

    if exmasses is None:

        return None

    adduct_method = settings.get('ex2ad_all')[adduct]

    addmasses = np.array([
        getattr(formula.Formula(exmass), adduct_method)()
        for exmass in exmasses
    ])

    if not addmasses.size:

        # `np.argmin` raises ValueError on an empty array; having no
        # candidate is reported the same way as having no match
        return None

    ppms = np.array([
        common.ppm(addmass, measured_mz)
        for addmass in addmasses
    ])

    return addmasses[np.argmin(np.abs(ppms))]
def new_logger(name=None, logdir=None, verbosity=None, **kwargs):
    """
    Returns a new logger with default settings (can be customized).

    Parameters
    ----------
    name : str
        Custom name for the log. Defaults to the ``module_name``
        setting.
    logdir : str
        Path to the directory to store log files. Defaults to
        ``<name>_log``.
    verbosity : int
        Verbosity level, lowest is 0. Messages from levels above this
        won't be written to the log. If not provided, 0 is used.
    **kwargs
        Passed to ``Logger``.

    Returns
    -------
    ``log.Logger`` instance.
    """

    name = name or settings.get('module_name')
    logdir = logdir or '%s_log' % name

    # the time stamp is made safe for use in a file name
    timestamp = Logger.timestamp().replace(' ', '_').replace(':', '.')

    return Logger(
        fname='%s__%s.log' % (name, timestamp),
        # The argument used to be discarded in favour of a hard-coded 0;
        # 0 remains the fallback so existing callers see no change.
        verbosity=0 if verbosity is None else verbosity,
        logdir=logdir,
        **kwargs
    )
def cache_dir_exists(self):
    """
    Makes sure the cache directory exists.

    If ``cache_dir`` is ``None`` it is first set from the ``cachedir``
    setting. The directory (with all missing parents) is created if
    it does not exist yet.
    """

    if self.cache_dir is None:

        self.cache_dir = settings.get('cachedir')

    # `exist_ok` avoids the race between checking for the directory
    # and creating it (e.g. when another process creates it meanwhile)
    os.makedirs(self.cache_dir, exist_ok=True)
def __init__(
    self,
    fname,
    verbosity=None,
    console_level=None,
    logdir=None,
    max_width=200,
):
    """
    Opens a log file and registers its periodic flushing.

    A job calling ``flush`` is registered on ``_log_flush_timeloop``
    with an interval of ``log_flush_interval`` (setting) seconds and
    the time loop is started in non-blocking mode.

    fname : str
        Log file name.
    logdir : name
        Path to the directory containing the log files.
    verbosity : int
        Messages at and below this level will be written into the
        logfile. All other messages will be dropped. Falls back to
        the ``log_verbosity`` setting.
    console_level : int
        Messages below this log level will be printed not only into
        logfile but also to the console. Falls back to the
        ``console_verbosity`` setting.
    max_width : int
        Width used by the text wrapper of this logger.
    """

    flush_interval = datetime.timedelta(
        seconds=settings.get('log_flush_interval')
    )

    @_log_flush_timeloop.job(interval=flush_interval)
    def _flush():

        self.flush()

    _log_flush_timeloop.start(block=False)

    self.wrapper = textwrap.TextWrapper(
        width=max_width,
        subsequent_indent=' ' * 22,
        break_long_words=False,
    )

    self.logdir = self.get_logdir(logdir)
    self.fname = os.path.join(self.logdir, fname)

    # explicit `None` checks: zero is a meaningful level
    if verbosity is None:

        verbosity = settings.get('log_verbosity')

    self.verbosity = verbosity

    if console_level is None:

        console_level = settings.get('console_verbosity')

    self.console_level = console_level

    self.open_logfile()

    # first messages in the new log
    self.msg('Welcome!')
    self.msg('Logger started, logging into `%s`.' % self.fname)
def get_logdir(self, dirname=None):
    """
    Tells the path of the log directory and makes sure it exists.

    If no ``dirname`` is provided, the directory is named after the
    ``module_name`` setting with the ``_log`` suffix. The directory is
    created if it does not exist yet.
    """

    if not dirname:

        dirname = '%s_log' % settings.get('module_name')

    os.makedirs(dirname, exist_ok=True)

    return dirname
def __init__(
    self,
    database='swisslipids',
    with_alcohols=True,
    with_coa=True,
    iso=False,
):
    """
    Processes lipid names used in databases. Converts names to the
    standard used in this module and extracts carbon count and
    unsaturation information and other features.

    Args
    ----
    :param str database:
        Name of the database; stored in lowercase.
    :param bool with_alcohols:
        Stored on the instance as ``with_alcohols``.
    :param bool with_coa:
        Stored on the instance as ``with_coa``.
    :param bool iso:
        Stored on the instance as ``iso``.
    """

    self.database = database.lower()
    self.with_alcohols, self.with_coa = with_alcohols, with_coa
    self.iso = iso

    # options coming from the module settings
    for attr in ('lipnamesf', 'adducts_constraints'):

        setattr(self, attr, settings.get(attr))

    self.gen_fa_greek()
    self.read_lipid_names()
def test_sec_unicorn_asc(self):
    """
    Reads the example file named by the ``sec_unicorn_example``
    setting and checks that the fraction with the highest mean in its
    profile is A10.
    """

    reader = sec.SECReader(settings.get('sec_unicorn_example'))

    fractions = list(reader.profile())
    fractions.sort(key=lambda fraction: fraction.mean, reverse=True)
    highest = fractions[0]

    assert highest.row == 'A'
    assert highest.col == 10
def test_protein_containing_samples(self):
    """
    Checks the selection of protein containing samples based on a
    SEC profile: with default arguments, with manually provided
    fractions and with an excluded fraction.

    Uses the example files named by the ``peaks_gltpd1_invivo`` and
    ``sec_gltpd1_invivo`` settings.
    """

    reader = sample.SampleReader(
        fname=settings.get('peaks_gltpd1_invivo'),
        input_type='peaks',
    )
    secpath = settings.get('sec_gltpd1_invivo')

    samples = reader.get_sampleset(
        sample_id_proc=sampleattrs.plate_sample_id_processor(),
    )

    secprofile = sampleattrs.SECProfile(
        sec_path=secpath,
        samples=samples,
        start_volume=1.2,
        offsets=(0.015, 0.045),
        start_col=9,
        start_row='A',
        length=samples.numof_samples,
    )

    # arguments of `protein_containing_samples` and the selection
    # expected from each call
    cases = (
        ({}, [False, False, True, True, False]),
        ({'manual': ['A10', 'A11']}, [False, True, True, False, False]),
        ({'exclude': ['A12']}, [False, False, True, False, False]),
    )

    for call_args, expected in cases:

        pcs = secprofile.protein_containing_samples(**call_args)

        assert np.all(pcs.selection == np.array(expected))
def test_sampleset_from_peaks(self):
    """
    Builds a sample set from the example file named by the
    ``peaks_example`` setting and checks one m/z value, the ID of the
    last sample and the ID stored in the label of the first sample.
    """

    reader = sample.SampleReader(
        fname=settings.get('peaks_example'),
        input_type='peaks',
    )

    sampleset_args = {
        'sample_id_proc': sampleattrs.plate_sample_id_processor(),
    }
    samples = reader.get_sampleset(sampleset_args=sampleset_args)

    mz_difference = samples.mzs_by_sample[7, 3] - 375.0018
    assert abs(mz_difference) < 0.0001

    last_sample_id = samples.attrs.sample_index_to_id[-1]
    assert last_sample_id == ('A', 12)

    first_sample_label = samples.attrs.attrs[0].attrs['label']
    assert first_sample_label['sample_id'] == ('A', 6)
def mz_from_name(
    self,
    adduct,
    name=None,
    database_preference=None,
    **kwargs
):
    """
    Tells the m/z of an adduct of the molecule looked up by name.

    The exact mass is obtained from ``mass_from_name``; the name of
    the ``formula.Formula`` method converting it to the adduct m/z is
    taken from the ``ex2ad_all`` setting by the ``adduct`` key.

    Returns
    -------
    The adduct m/z or ``None`` if ``mass_from_name`` found no mass.
    """

    exmass = self.mass_from_name(
        name=name,
        database_preference=database_preference,
        **kwargs
    )

    if exmass is None:

        return None

    method_name = settings.get('ex2ad_all')[adduct]
    to_adduct_mz = getattr(formula.Formula(exmass), method_name)

    return to_adduct_mz()
def test_sec_xls(self):
    """
    Reads the example file named by the ``sec_xls_example`` setting
    and checks which fraction has the highest mean in the profiles
    starting at volumes 0.615 and 0.645: these should be A12 and A11,
    respectively.
    """

    reader = sec.SECReader(settings.get('sec_xls_example'))

    def highest_fraction(start_volume):
        # the fraction with the highest mean in the profile

        fractions = list(reader.profile(start_volume=start_volume))
        fractions.sort(key=lambda fraction: fraction.mean, reverse=True)

        return fractions[0]

    highest015 = highest_fraction(0.615)
    highest045 = highest_fraction(0.645)

    assert highest015.row == 'A'
    assert highest015.col == 12
    assert highest045.row == 'A'
    assert highest045.col == 11
def set_paths(self):
    """
    Sets up the working directory and the paths of the output files.

    Unless the instance already has a ``name``, it is derived from the
    input mzML (profile, or centroid if no profile is given) by
    removing the path and the extension. The working directory is
    ``<wd_root>/<name>`` where ``wd_root`` falls back to the
    ``ms_preproc_wd`` setting; the directory is created if missing.
    The centroid mzML and the features file get default names based
    on ``name`` if not set, and both are placed in the working
    directory.
    """

    if not hasattr(self, 'name'):

        # default name for all files: the input mzML without
        # its path and extension
        source_mzml = self.profile_mzml or self.centroid_mzml
        stem, _ = os.path.splitext(os.path.basename(source_mzml))
        self.name = stem

    # the working directory
    if not self.wd_root:

        self.wd_root = settings.get('ms_preproc_wd')

    self.wd = os.path.join(self.wd_root, self.name)
    os.makedirs(self.wd, exist_ok=True)

    # output files within the working directory
    centroid_name = self.centroid_mzml or '%s__peaks.mzML' % self.name
    self.centroid_mzml = os.path.join(self.wd, centroid_name)

    features_name = (
        self.features_file or
        '%s__features.featureXML' % self.name
    )
    self.features_file = os.path.join(self.wd, features_name)
def __init__(
    self,
    levels=None,
    silent=False,
    nameproc_args=None,
    branched=False,
    exact_mass_formula_fallback=True,
):
    """
    Downloads and serves the SwissLipids database.

    Automatically downloads the data at the first time and stores it
    in a cache file to be read from there at next usage.
    Scans the entire file and builds multiple indices in order to
    quickly access records upon request. Provides a number of methods
    to retrieve records either as lines or openbabel OBMol instances.

    Args
    ----
    :param set levels:
        Levels in SwissLipids hierarchy. By default only "Species".
    :param bool silent:
        Stored on the instance as ``silent``.
    :param dict nameproc_args:
        Arguments passed to the name processor.
    :param bool branched:
        Include lipids with branched alkyl chain (iso).
        NOTE(review): the value is accepted but not used within this
        constructor -- confirm whether it should be stored.
    :param bool exact_mass_formula_fallback:
        If exact mass is not available from SwissLipids calculate it
        from the formula. This is dangerous because the formula is
        sometimes in a dehydrogenated and charged state while the
        exact mass should be uncharged with all hydrogens.
    """

    # A fresh set for each instance: a set in the signature would be
    # created once and shared by all calls relying on the default.
    if levels is None:

        levels = {'Species'}

    self.silent = silent
    self.exact_mass_formula_fallback = exact_mass_formula_fallback
    self.nameproc_args = nameproc_args or {}
    self.set_levels(levels)
    self.url = settings.get('swisslipids_url')
    self.load()
    self.make_index()
def adduct_lookup(
    self,
    mz,
    adducts=None,
    ionmode=None,
    charge=None,
    adduct_constraints=True,
    tolerance=None,
):
    """Does a series of lookups in the database assuming various adducts.

    Calculates the exact mass for the m/z for each possible adduct
    and searches these exact masses in the database.

    Parameters
    ----------
    mz :
        The m/z to look up; converted to ``mzmod.Mz``.
    adducts :
        Adduct types to consider. If empty and ``ionmode`` is `pos`
        or `neg`, all adducts listed in the ``ex2ad`` setting for the
        charge and ion mode are used.
        (Default value = None)
    ionmode :
        Ion mode, `pos` or `neg`; selects the adduct definitions in
        the settings.
        (Default value = None)
    charge :
        Charge; only its absolute value is used. If not provided it is
        1 for `pos` ion mode and -1 otherwise.
        (Default value = None)
    adduct_constraints :
        If true, results are filtered by adduct: for headgroups listed
        in the ``adduct_constraints`` setting only the adducts listed
        there are kept, for all other headgroups only the adducts in
        the ``adducts_default`` setting.
        (Default value = True)
    tolerance :
        Passed to ``lookup_accuracy``.
        (Default value = None)

    Returns
    -------
    A dict with adduct types as keys and tuples of 3 numpy arrays as
    values: exact masses, database record details and accuracies
    (ppm). Adducts without any result are left out.
    """

    mz = mzmod.Mz(mz)

    if charge is None:

        charge = 1 if ionmode == 'pos' else -1

    abs_charge = abs(charge)

    if not adducts and ionmode in {'pos', 'neg'}:

        # we look up all adducts we have a method for
        adducts = list(settings.get('ex2ad')[abs_charge][ionmode].keys())

    ad_default = settings.get('adducts_default')[ionmode][abs_charge]
    ad_constr = settings.get('adduct_constraints')[ionmode]
    exmethods = settings.get('ad2ex')[abs_charge][ionmode]

    # names of the m/z to exact mass conversion methods by adduct
    methods = {ad: exmethods[ad] for ad in adducts}

    result = {}

    for ad, method in methods.items():

        to_exact_mass = getattr(mz, method)
        exmz = to_exact_mass()
        res = self.lookup_accuracy(exmz, tolerance=tolerance)

        if adduct_constraints:

            # indices of the records where this adduct is allowed
            # for the headgroup of the record
            ires = tuple(
                i
                for i in range(res[0].shape[0])
                if (
                    ad in ad_constr[res[1][i].hg]
                    if res[1][i].hg in ad_constr else
                    ad in ad_default
                )
            )

            res = (res[0][ires, ], res[1][ires, ], res[2][ires, ])

        if len(res[0]):

            result[ad] = res

    return result
from lipyd import settings
from lipyd import sample
from lipyd import sampleattrs

# Example: reading a peaks file into a sample set, sorting all features
# by the m/z values of one sample and accessing the data of that sample.

# the example shipped with the module is looked up first,
# then a local file is used instead
peaksfile = settings.get('peaks_example')
peaksfile = 'stard10_pos.csv'

reader = sample.SampleReader(fname = peaksfile, input_type = 'peaks')

sampleset_args = {
    'sample_id_proc': sampleattrs.plate_sample_id_processor(),
}
samples = reader.get_sampleset(sampleset_args = sampleset_args)

# ordering of the features by the m/z's in sample A10
idx = samples.mzs_by_sample[
    :,
    samples.attrs.sample_id_to_index[('A', 10)]
].argsort()

samples.sort_all(by = idx)

# m/z's for one sample:
samples.mzs_by_sample[
    :,
    samples.attrs.sample_id_to_index[('A', 10)]
]
# intensities for one sample:
samples.intensities[
    :,
    samples.attrs.sample_id_to_index[('A', 10)]
]
# RTs for one sample:
samples.rts[
    :,
    samples.attrs.sample_id_to_index[('A', 10)]
]
# ## 9: MS2 fragment database # Look up a negative mode fragment m/z in the database. It results an array with mass, fragment name, fragment type, aliphatic chain type, carbon count, unsaturation and charge in each row. At neutral losses the charge is 0. # In[89]: fragdb.lookup_neg(283.26) # Now let's annotate an MS2 scan with possible fragment identifications. To do this we open an example MGF file included in the module. The `lipyd.mgf` module serves MS2 scans from MGF files on demand. Btw the `lipyd.settings` module gives easy access for and control over near 100 customizable parameters. # In[104]: from lipyd import mgf from lipyd import settings mgffile = settings.get('mgf_example') mgfreader = mgf.MgfReader(mgffile) precursor = 590.45536 # this is a Cer-1P idx, rtdiff = mgfreader.lookup_scan_ids(precursor) # We found the following scans for precursor 590.455: # In[105]: idx # Select a scan from the ones above and annotate its fragments: # In[106]: scan = mgfreader.scan_by_id(1941)