def _get_n_pc(self, n_pc=None):
    max_n_pc = self.ev.size
    if n_pc is None:
        n_pc = max_n_pc
        return n_pc
    elif isinstance(n_pc, int):
        n_pc = min(n_pc, max_n_pc)
        return n_pc
    elif n_pc == 'auto':
        M, N = self.X.shape
        if M >= N:
            n_pc = self._infer_pc_()
            return n_pc
        else:
            info_('Cannot use `auto` if n_observations < '
                  'n_features. Try with threshold 0.9999')
            n_pc = 0.9999

    if 0 < n_pc < 1.0:
        # number of PC for which the cumulated explained variance is
        # less than a given ratio
        n_pc = np.searchsorted(self.ev_cum.data / 100., n_pc) + 1
        return n_pc
    else:
        raise ValueError('could not get a valid number of components')
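
# Why: a minimal, self-contained sketch of how a fractional `n_pc` threshold
# maps to a number of components via the cumulated explained variance, as in
# the last branch above (the `ev_cum` values and the 0.9999 threshold are
# made up for this illustration).
import numpy as np

ev_cum = np.array([60., 85., 95., 99., 99.995, 100.])  # cumulated variance, %
threshold = 0.9999
n_pc = np.searchsorted(ev_cum / 100., threshold) + 1
print(n_pc)  # -> 5: five components explain at least 99.99 % of the variance
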
def nlssubprob(V, W, Hinit, tol, maxiter):
    """
    H, grad : output solution and gradient
    iter : #iterations used
    V, W : constant matrices
    Hinit : initial solution
    tol : stopping tolerance
    maxiter : limit of iterations
    """
    H = Hinit
    WtV = np.dot(W.T, V)
    WtW = np.dot(W.T, W)
    alpha = 1
    beta = 0.1

    for n_iter in range(1, maxiter + 1):
        grad = np.dot(WtW, H) - WtV
        if norm(grad * np.logical_or(grad < 0, H > 0)) < tol:
            break
        Hp = H

        # search step size
        for inner_iter in range(20):
            # gradient step
            Hn = H - alpha * grad
            # projection onto the non-negative orthant
            Hn *= Hn > 0
            d = Hn - H
            gradd = np.dot(grad.ravel(), d.ravel())
            dQd = np.dot(np.dot(WtW, d).ravel(), d.ravel())
            suff_decr = 0.99 * gradd + 0.5 * dQd < 0
            if inner_iter == 0:
                decr_alpha = not suff_decr
                Hp = H
            if decr_alpha:
                if suff_decr:
                    H = Hn
                    break
                else:
                    alpha = alpha * beta
            else:
                if not suff_decr or (Hp == Hn).all():
                    H = Hp
                    break
                else:
                    alpha = alpha / beta
                    Hp = Hn

    if n_iter == maxiter:
        info_('Max iter in nlssubprob')

    return H, grad, n_iter
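
# Why: a minimal usage sketch of nlssubprob on random matrices (the sizes and
# tolerances are made up). Assumptions, not part of the snippet above: `norm`
# is numpy.linalg.norm and `info_` is the package logger; stand-ins are set up
# here so the sketch runs on its own.
import numpy as np
from numpy.linalg import norm

try:
    info_
except NameError:  # stand-in logger for this self-contained demo only
    info_ = print

rng = np.random.default_rng(0)
V = rng.random((20, 30))        # data matrix to approximate
W = rng.random((20, 4))         # fixed left factor
Hinit = rng.random((4, 30))     # initial guess for the right factor
H, grad, n_used = nlssubprob(V, W, Hinit, tol=1e-4, maxiter=100)
assert (H >= 0).all()           # the projection keeps H non-negative
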
def callback(*args, **kwargs):
    """
    callback log.info function
    """
    global niter, chi2, everyiter, ncalls
    niter += 1
    if niter % everyiter != 0:
        return
    if not self.silent:
        display.clear_output(wait=True)
        info_("Iterations: %d, Calls: %d (chi2: %.5f)" % (niter, ncalls, chi2))
        sys.stdout.flush()
def check_filename_to_save(dataset, filename=None, save_as=False, confirm=True, **kwargs):

    from spectrochempy import NO_DIALOG
    from spectrochempy.core import info_

    NODIAL = (NO_DIALOG or "DOC_BUILDING" in environ) and "KEEP_DIALOGS" not in environ

    if filename and pathclean(filename).parent.resolve() == Path.cwd():
        filename = Path.cwd() / filename

    if not filename or save_as or filename.exists():
        from spectrochempy.core import save_dialog

        # no filename provided
        open_diag = True
        caption = "Save as ..."
        if filename is None or (NODIAL and pathclean(filename).is_dir()):
            filename = dataset.name
            filename = filename + kwargs.get("suffix", ".scp")

        # existing filename provided
        elif filename.exists():
            if confirm:
                caption = "File exists. Confirm overwrite"
            else:
                info_(f"A file {filename} was present and has been overwritten.")
                open_diag = False

        if not NODIAL and open_diag:
            filename = save_dialog(
                caption=kwargs.pop("caption", caption),
                filename=filename,
                filters=kwargs.pop("filetypes", ["All file types (*.*)"]),
                **kwargs,
            )
            if filename is None:
                # this is probably due to a cancel action for an open dialog.
                return

    return pathclean(filename)
def test_ndmath_classmethod_implementation(nd2d, name):
    nd = nd2d.copy()
    try:
        getattr(NDDataset, name)
    except AttributeError:
        info_("\n{} is not yet implemented".format(name))
    try:
        getattr(np.ma, name)
        getattr(np, name)(nd)
    except AttributeError:
        info_("\n{} is not a np.ma method".format(name))
    except TypeError as e:
        if "required positional" in e.args[0]:
            pass
        else:
            raise TypeError(*e.args)
def _enabled_process(self, flag):
    if flag:
        self._io.children = [
            self._load_button,
            self._process_button,
            self._save_button,
        ]
        self._controls.children = [
            self._limits_control,
            self._method_control,
            self._interpolation_control,
            self._ranges_control,
        ]
    else:
        self._io.children = [self._load_button]
        self._controls.children = []
        with self._output:
            info_("No data have been defined.\n"
                  "Use the upload button to load data to be processed!")
def _read_spa(*args, **kwargs):
    dataset, filename = args
    content = kwargs.get("content", False)

    if content:
        fid = io.BytesIO(content)
    else:
        fid = open(filename, "rb")

    return_ifg = kwargs.get("return_ifg", None)

    # Read name:
    # The name starts at position hex 1e = decimal 30. Its max length
    # is 256 bytes. It is the original filename under which the spectrum has
    # been saved: it won't match with the actual filename if a subsequent
    # renaming has been done in the OS.
    spa_name = _readbtext(fid, 30, 256)

    # The acquisition date (GMT) is at hex 128 = decimal 296.
    # Seconds since 31/12/1899, 00:00
    fid.seek(296)
    timestamp = _fromfile(fid, dtype="uint32", count=1)
    acqdate = datetime(1899, 12, 31, 0, 0, tzinfo=timezone.utc) + timedelta(
        seconds=int(timestamp)
    )
    acquisitiondate = acqdate

    # Transform back to timestamp for storage in the Coord object
    # use datetime.fromtimestamp(d, timezone.utc) to transform back to datetime object
    timestamp = acqdate.timestamp()

    # From hex 130 = decimal 304, the spectrum is described
    # by a block of lines starting with "key values",
    # for instance hex[02 6a 6b 69 1b 03 82] -> dec[02 106 107 105 27 03 130]
    # Each of these lines provides positions of data and metadata in the file:
    #
    #     key: hex 02, dec  02: position of spectral header (=> nx,
    #                           firstx, lastx, nscans, nbkgscans)
    #     key: hex 03, dec  03: intensity position
    #     key: hex 04, dec  04: user text position (custom info, can be present
    #                           several times. The text length is five bytes later)
    #     key: hex 1B, dec  27: position of History text. The text length
    #                           is five bytes later
    #     key: hex 53, dec  83: probably not a position, present when 'Retrieved from library'
    #     key: hex 64, dec 100: ?
    #     key: hex 66, dec 102: sample interferogram
    #     key: hex 67, dec 103: background interferogram
    #     key: hex 69, dec 105: ?
    #     key: hex 6a, dec 106: ?
    #     key: hex 80, dec 128: ?
    #     key: hex 82, dec 130: position of 'Experiment Information'. The text length
    #                           is five bytes later. The block gives Experiment filename (at +10),
    #                           Experiment title (+90), custom text (+254), accessory name (+413)
    #     key: hex 92, dec 146: position of 'custom infos'. The text length
    #                           is five bytes later.
    #
    # The line preceding the block starts with '01' or '0A'.
    # The lines after the block generally start with '00', except in few cases where
    # they start by '01'. In such cases, the '53' key is also present
    # (before the '1B').

    # scan "key values"
    pos = 304
    spa_comments = []  # several custom comments can be present

    while "continue":
        fid.seek(pos)
        key = _fromfile(fid, dtype="uint8", count=1)

        if key == 2:
            # read the position of the header
            fid.seek(pos + 2)
            pos_header = _fromfile(fid, dtype="uint32", count=1)
            info = _read_header(fid, pos_header)

        elif key == 3 and return_ifg is None:
            intensities = _getintensities(fid, pos)

        elif key == 4:
            fid.seek(pos + 2)
            comments_pos = _fromfile(fid, "uint32", 1)
            fid.seek(pos + 6)
            comments_len = _fromfile(fid, "uint32", 1)
            fid.seek(comments_pos)
            spa_comments.append(fid.read(comments_len).decode("latin-1", "replace"))

        elif key == 27:
            fid.seek(pos + 2)
            history_pos = _fromfile(fid, "uint32", 1)
            fid.seek(pos + 6)
            history_len = _fromfile(fid, "uint32", 1)
            spa_history = _readbtext(fid, history_pos, history_len)

        elif key == 102 and return_ifg == "sample":
            s_ifg_intensities = _getintensities(fid, pos)

        elif key == 103 and return_ifg == "background":
            b_ifg_intensities = _getintensities(fid, pos)

        elif key == 0 or key == 1:
            break

        pos += 16

    fid.close()

    if (return_ifg == "sample" and "s_ifg_intensities" not in locals()) or (
        return_ifg == "background" and "b_ifg_intensities" not in locals()
    ):
        info_("No interferogram found, read_spa returns None")
        return None
    elif return_ifg == "sample":
        intensities = s_ifg_intensities
    elif return_ifg == "background":
        intensities = b_ifg_intensities

    # load intensity into the NDDataset
    dataset.data = np.array(intensities[np.newaxis], dtype="float32")

    if return_ifg == "background":
        title = "sample acquisition timestamp (GMT)"  # bckg acquisition date is not known for the moment...
    else:
        title = "acquisition timestamp (GMT)"  # no ambiguity here

    _y = Coord(
        [timestamp],
        title=title,
        units="s",
        labels=([acquisitiondate], [filename]),
    )

    # useful when a part of the spectrum/ifg has been blanked:
    dataset.mask = np.isnan(dataset.data)

    if return_ifg is None:
        default_description = f"# Omnic name: {spa_name}\n# Filename: {filename.name}"
        dataset.units = info["units"]
        dataset.title = info["title"]

        # now add coordinates
        nx = info["nx"]
        firstx = info["firstx"]
        lastx = info["lastx"]
        xunit = info["xunits"]
        xtitle = info["xtitle"]

        spacing = (lastx - firstx) / (nx - 1)
        _x = LinearCoord(
            offset=firstx, increment=spacing, size=nx, title=xtitle, units=xunit
        )

    else:  # interferogram
        if return_ifg == "sample":
            default_description = (
                f"# Omnic name: {spa_name} : sample IFG\n# Filename: {filename.name}"
            )
        else:
            default_description = (
                f"# Omnic name: {spa_name} : background IFG\n# Filename: {filename.name}"
            )
        spa_name += ": Sample IFG"
        dataset.units = "V"
        dataset.title = "detector signal"
        _x = LinearCoord(
            offset=0,
            increment=1,
            size=len(intensities),
            title="data points",
            units=None,
        )

    dataset.set_coordset(y=_y, x=_x)
    dataset.name = spa_name  # to be consistent with omnic behaviour
    dataset.filename = str(filename)

    # Set origin, description, history, date
    # Omnic spg files don't have a specific "origin" field stating the origin of the data
    dataset.description = kwargs.get("description", default_description) + "\n"
    if len(spa_comments) > 1:
        dataset.description += "# Comments from Omnic:\n"
        for comment in spa_comments:
            dataset.description += comment + "\n---------------------\n"

    dataset.history = str(datetime.now(timezone.utc)) + ":imported from spa file(s)"

    if "spa_history" in locals():
        if len(spa_history.strip(" ")) > 0:
            dataset.history = (
                "Data processing history from Omnic :\n------------------------------------\n"
                + spa_history
            )

    dataset._date = datetime.now(timezone.utc)

    dataset.meta.collection_length = info["collection_length"] / 100 * ur("s")
    dataset.meta.optical_velocity = info["optical_velocity"]
    dataset.meta.laser_frequency = info["reference_frequency"] * ur("cm^-1")

    if dataset.x.units is None and dataset.x.title == "data points":
        # interferogram
        dataset.meta.interferogram = True
        dataset.meta.td = list(dataset.shape)
        dataset.x._zpd = int(np.argmax(dataset)[-1])
        dataset.x.set_laser_frequency()
        dataset.x._use_time_axis = (
            False  # True to have time, else it will be optical path difference
        )

    return dataset
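
# Why: the OMNIC epoch used in _read_spa (seconds counted from 31/12/1899
# 00:00 GMT) differs from the Unix epoch, so here is a minimal, self-contained
# sketch of the round-trip done above (the raw value 3_600_000_000 is an
# illustrative placeholder, not a real file value).
from datetime import datetime, timedelta, timezone

raw = 3_600_000_000  # uint32 read at file offset 296 (illustrative value)
acqdate = datetime(1899, 12, 31, 0, 0, tzinfo=timezone.utc) + timedelta(seconds=raw)
unix_ts = acqdate.timestamp()  # Unix timestamp stored in the y Coord
assert datetime.fromtimestamp(unix_ts, timezone.utc) == acqdate  # round trip
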
def _read_header(fid, pos):
    """
    Read spectrum/ifg/series header.

    Parameters
    ----------
    fid : BufferedReader
        The buffered binary stream.
    pos : int
        The position of the header (see Notes).

    Returns
    -------
    dict
        Dictionary with the header content.

    Notes
    -----
    So far, the header structure is as follows:

    - starts with b'\x01', b'\x02', b'\x03' ... maybe indicating the header "type"
    - nx (UInt32): 4 bytes behind
    - xunits (UInt8): 8 bytes behind. So far, we have the following correspondence:

        * `x\01`: wavenumbers, cm-1
        * `x\02`: datapoints (interferogram)
        * `x\03`: wavelength, nm
        * `x\04`: wavelength, um
        * `x\20`: Raman shift, cm-1

    - data units (UInt8): 12 bytes behind. So far, we have the following
      correspondence:

        * `x\11`: absorbance
        * `x\10`: transmittance (%)
        * `x\0B`: reflectance (%)
        * `x\0C`: Kubelka_Munk
        * `x\16`: Volts (interferogram)
        * `x\1A`: photoacoustic
        * `x\1F`: Raman intensity

    - first x value (float32), 16 bytes behind
    - last x value (float32), 20 bytes behind
    - ... unknown
    - scan points (UInt32), 28 bytes behind
    - zpd (UInt32), 32 bytes behind
    - number of scans (UInt32), 36 bytes behind
    - ... unknown
    - number of background scans (UInt32), 52 bytes behind
    - ... unknown
    - collection length in 1/100th of sec (UInt32), 68 bytes behind
    - ... unknown
    - reference frequency (float32), 80 bytes behind
    - ...
    - optical velocity (float32), 188 bytes behind
    - ...
    - spectrum history (text), 208 bytes behind

    For "rapid-scan" srs files:

    - series name (text), 938 bytes behind
    - collection length (float32), 1002 bytes behind
    - last y (float32), 1006 bytes behind
    - first y (float32), 1010 bytes behind
    - ny (UInt32), 1026 bytes behind
    - ... y unit could be at pos+1030 with 01 = minutes ?
    - history (text), 1200 bytes behind (only initial history. When reprocessed,
      the updated history is at the end of the file, after the
      b'\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF' sequence)
    """
    out = {}

    # determine the type of file
    fid.seek(0)
    bytes = fid.read(18)
    if bytes == b"Spectral Data File":
        filetype = "spa, spg"
    elif bytes == b"Spectral Exte File":
        filetype = "srs"

    # nx
    fid.seek(pos + 4)
    out["nx"] = _fromfile(fid, "uint32", count=1)

    # xunits
    fid.seek(pos + 8)
    key = _fromfile(fid, dtype="uint8", count=1)
    if key == 1:
        out["xunits"] = "cm^-1"
        out["xtitle"] = "wavenumbers"
    elif key == 2:
        out["xunits"] = None
        out["xtitle"] = "data points"
    elif key == 3:  # pragma: no cover
        out["xunits"] = "nm"
        out["xtitle"] = "wavelengths"
    elif key == 4:  # pragma: no cover
        out["xunits"] = "um"
        out["xtitle"] = "wavelengths"
    elif key == 32:  # pragma: no cover
        out["xunits"] = "cm^-1"
        out["xtitle"] = "raman shift"
    else:  # pragma: no cover
        out["xunits"] = None
        out["xtitle"] = "xaxis"
        info_("The nature of x data is not recognized, xtitle is set to 'xaxis'")

    # data units
    fid.seek(pos + 12)
    key = _fromfile(fid, dtype="uint8", count=1)
    if key == 17:
        out["units"] = "absorbance"
        out["title"] = "absorbance"
    elif key == 16:  # pragma: no cover
        out["units"] = "percent"
        out["title"] = "transmittance"
    elif key == 11:  # pragma: no cover
        out["units"] = "percent"
        out["title"] = "reflectance"
    elif key == 12:  # pragma: no cover
        out["units"] = None
        out["title"] = "log(1/R)"
    elif key == 20:  # pragma: no cover
        out["units"] = "Kubelka_Munk"
        out["title"] = "Kubelka-Munk"
    elif key == 21:
        out["units"] = None
        out["title"] = "reflectance"
    elif key == 22:
        out["units"] = "V"
        out["title"] = "detector signal"
    elif key == 26:  # pragma: no cover
        out["units"] = None
        out["title"] = "photoacoustic"
    elif key == 31:  # pragma: no cover
        out["units"] = None
        out["title"] = "Raman intensity"
    else:  # pragma: no cover
        out["units"] = None
        out["title"] = "intensity"
        info_("The nature of data is not recognized, title set to 'intensity'")

    # firstx, lastx
    fid.seek(pos + 16)
    out["firstx"] = _fromfile(fid, "float32", 1)
    fid.seek(pos + 20)
    out["lastx"] = _fromfile(fid, "float32", 1)
    fid.seek(pos + 28)
    out["scan_pts"] = _fromfile(fid, "uint32", 1)
    fid.seek(pos + 32)
    out["zpd"] = _fromfile(fid, "uint32", 1)
    fid.seek(pos + 36)
    out["nscan"] = _fromfile(fid, "uint32", 1)
    fid.seek(pos + 52)
    out["nbkgscan"] = _fromfile(fid, "uint32", 1)
    fid.seek(pos + 68)
    out["collection_length"] = _fromfile(fid, "uint32", 1)
    fid.seek(pos + 80)
    out["reference_frequency"] = _fromfile(fid, "float32", 1)
    fid.seek(pos + 188)
    out["optical_velocity"] = _fromfile(fid, "float32", 1)

    if filetype == "spa, spg":
        out["history"] = _readbtext(fid, pos + 208, None)

    if filetype == "srs":
        if out["nbkgscan"] == 0:
            # an interferogram in rapid scan mode
            if out["firstx"] > out["lastx"]:
                out["firstx"], out["lastx"] = out["lastx"], out["firstx"]

        out["name"] = _readbtext(fid, pos + 938, 256)
        fid.seek(pos + 1002)
        out["collection_length"] = _fromfile(fid, "float32", 1) * 60
        fid.seek(pos + 1006)
        out["lasty"] = _fromfile(fid, "float32", 1)
        fid.seek(pos + 1010)
        out["firsty"] = _fromfile(fid, "float32", 1)
        fid.seek(pos + 1026)
        out["ny"] = _fromfile(fid, "uint32", 1)
        # y unit could be at pos+1030 with 01 = minutes ?
        out["history"] = _readbtext(fid, pos + 1200, None)

        if _readbtext(fid, pos + 208, 256)[:10] == "Background":
            # it is the header of a background
            out["background_name"] = _readbtext(fid, pos + 208, 256)[10:]

    return out
def __init__(self, dataset, guess, **kwargs):

    # list all default arguments:
    tol = kwargs.get("tol", 0.1)
    maxit = kwargs.get("maxit", 50)
    maxdiv = kwargs.get("maxdiv", 5)

    nonnegConc = kwargs.get("nonnegConc", "all")

    unimodConc = kwargs.get("unimodConc", "all")
    unimodConcTol = kwargs.get("unimodConcTol", 1.1)
    unimodConcMod = kwargs.get("unimodConcMod", "strict")
    if "unimodTol" in kwargs.keys():
        warnings.warn("unimodTol deprecated, use unimodConcTol instead",
                      DeprecationWarning)
        unimodConcTol = kwargs.get("unimodTol", 1.1)
    if "unimodMod" in kwargs.keys():
        warnings.warn("unimodMod deprecated, use unimodConcMod instead",
                      DeprecationWarning)
        unimodConcMod = kwargs.get("unimodMod", "strict")

    monoDecConc = kwargs.get("monoDecConc", None)
    monoIncTol = kwargs.get("monoIncTol", 1.1)
    monoIncConc = kwargs.get("monoIncConc", None)
    monoDecTol = kwargs.get("monoDecTol", 1.1)

    closureConc = kwargs.get("closureConc", None)
    closureTarget = kwargs.get("closureTarget", "default")
    closureMethod = kwargs.get("closureMethod", "scaling")

    hardConc = kwargs.get("hardConc", None)
    getConc = kwargs.get("getConc", None)
    argsGetConc = kwargs.get("argsGetConc", None)
    hardC_to_C_idx = kwargs.get("hardC_to_C_idx", "default")

    unimodSpec = kwargs.get("unimodSpec", None)
    unimodSpecTol = kwargs.get("unimodSpecTol", 1.1)
    unimodSpecMod = kwargs.get("unimodSpecMod", "strict")

    nonnegSpec = kwargs.get("nonnegSpec", "all")

    normSpec = kwargs.get("normSpec", None)

    if "verbose" in kwargs.keys():
        warnings.warn(
            "verbose deprecated. Instead, use set_loglevel(INFO) before launching MCRALS",
            DeprecationWarning,
        )
        set_loglevel(INFO)

    # Check initial data
    # ------------------------------------------------------------------------
    initConc, initSpec = False, False

    if type(guess) is np.ndarray:
        guess = NDDataset(guess)

    X = dataset

    if X.shape[0] == guess.shape[0]:
        initConc = True
        C = guess.copy()
        C.name = "Pure conc. profile, mcs-als of " + X.name
        nspecies = C.shape[1]
    elif X.shape[1] == guess.shape[1]:
        initSpec = True
        St = guess.copy()
        St.name = "Pure spectra profile, mcs-als of " + X.name
        nspecies = St.shape[0]
    else:
        raise ValueError("the dimensions of guess do not match the data")

    ny, _ = X.shape

    # makes a PCA with same number of species for further comparison
    Xpca = PCA(X).reconstruct(n_pc=nspecies)

    # reset default text to indexes
    # ------------------------------
    if nonnegConc == "all":
        nonnegConc = np.arange(nspecies)
    elif nonnegConc is None:
        nonnegConc = []
    elif nonnegConc != [] and (len(nonnegConc) > nspecies
                               or max(nonnegConc) + 1 > nspecies):
        raise ValueError(
            f"The guess has only {nspecies} species, please check nonnegConc")

    if unimodConc == "all":
        unimodConc = np.arange(nspecies)
    elif unimodConc is None:
        unimodConc = []
    elif unimodConc != [] and (len(unimodConc) > nspecies
                               or max(unimodConc) + 1 > nspecies):
        raise ValueError(
            f"The guess has only {nspecies} species, please check unimodConc")

    if closureTarget == "default":
        closureTarget = np.ones(ny)
    elif len(closureTarget) != ny:
        raise ValueError(
            f"The data contain only {ny} observations, please check closureTarget")

    if hardC_to_C_idx == "default":
        hardC_to_C_idx = np.arange(nspecies)
    elif len(hardC_to_C_idx) > nspecies or max(hardC_to_C_idx) + 1 > nspecies:
        raise ValueError(
            f"The guess has only {nspecies} species, please check hardC_to_C_idx")

    # constraints on spectra
    if unimodSpec == "all":
        unimodSpec = np.arange(nspecies)
    elif unimodSpec is None:
        unimodSpec = []
    elif unimodSpec != [] and (len(unimodSpec) > nspecies
                               or max(unimodSpec) + 1 > nspecies):
        raise ValueError(
            f"The guess has only {nspecies} species, please check unimodSpec")

    if nonnegSpec == "all":
        nonnegSpec = np.arange(nspecies)
    elif nonnegSpec is None:
        nonnegSpec = []
    elif nonnegSpec != [] and (len(nonnegSpec) > nspecies
                               or max(nonnegSpec) + 1 > nspecies):
        raise ValueError(
            f"The guess has only {nspecies} species, please check nonnegSpec")

    # Compute initial spectra or concentrations (first iteration...)
    # ------------------------------------------------------------------------
    if initConc:
        if C.coordset is None:
            C.set_coordset(y=X.y, x=C.x)
        St = NDDataset(np.linalg.lstsq(C.data, X.data, rcond=None)[0])
        St.name = "Pure spectra profile, mcs-als of " + X.name
        St.title = X.title
        cy = C.x.copy() if C.x else None
        cx = X.x.copy() if X.x else None
        St.set_coordset(y=cy, x=cx)

    if initSpec:
        if St.coordset is None:
            St.set_coordset(y=St.y, x=X.x)
        Ct = np.linalg.lstsq(St.data.T, X.data.T, rcond=None)[0]
        C = NDDataset(Ct.T)
        C.name = "Pure conc. profile, mcs-als of " + X.name
        C.title = "concentration"
        cx = St.y.copy() if St.y else None
        cy = X.y.copy() if X.y else None
        C.set_coordset(y=cy, x=cx)

    change = tol + 1
    stdev = X.std()
    niter = 0
    ndiv = 0

    log = "*** ALS optimisation log***\n"
    log += "#iter     Error/PCA        Error/Exp       %change \n"
    log += "------------------------------------------------- \n"
    info_(log)

    while change >= tol and niter < maxit and ndiv < maxdiv:

        C.data = np.linalg.lstsq(St.data.T, X.data.T, rcond=None)[0].T
        niter += 1

        # Force non-negative concentration
        # --------------------------------
        if nonnegConc is not None:
            for s in nonnegConc:
                C.data[:, s] = C.data[:, s].clip(min=0)

        # Force unimodal concentration
        # ----------------------------
        if unimodConc != []:
            C.data = _unimodal_2D(
                C.data,
                idxes=unimodConc,
                axis=0,
                tol=unimodConcTol,
                mod=unimodConcMod,
            )

        # Force monotonic increase
        # ------------------------
        if monoIncConc is not None:
            for s in monoIncConc:
                for curid in np.arange(ny - 1):
                    if C.data[curid + 1, s] < C.data[curid, s] / monoIncTol:
                        C.data[curid + 1, s] = C.data[curid, s]

        # Force monotonic decrease
        # ----------------------------------------------
        if monoDecConc is not None:
            for s in monoDecConc:
                for curid in np.arange(ny - 1):
                    if C.data[curid + 1, s] > C.data[curid, s] * monoDecTol:
                        C.data[curid + 1, s] = C.data[curid, s]

        # Closure
        # ------------------------------------------
        if closureConc is not None:
            if closureMethod == "scaling":
                Q = np.linalg.lstsq(C.data[:, closureConc], closureTarget.T,
                                    rcond=None)[0]
                C.data[:, closureConc] = np.dot(C.data[:, closureConc], np.diag(Q))
            elif closureMethod == "constantSum":
                totalConc = np.sum(C.data[:, closureConc], axis=1)
                C.data[:, closureConc] = (C.data[:, closureConc]
                                          * closureTarget[:, None]
                                          / totalConc[:, None])

        # external concentration profiles
        # ------------------------------------------
        if hardConc is not None:
            extOutput = getConc(*argsGetConc)
            if isinstance(extOutput, dict):
                fixedC = extOutput["concentrations"]
                argsGetConc = extOutput["new_args"]
            else:
                fixedC = extOutput
            C.data[:, hardConc] = fixedC[:, hardC_to_C_idx]

        # stores C in C_hard
        Chard = C.copy()

        # compute St
        St.data = np.linalg.lstsq(C.data, X.data, rcond=None)[0]

        # stores St in Stsoft
        Stsoft = St.copy()

        # Force non-negative spectra
        # --------------------------
        if nonnegSpec is not None:
            St.data[nonnegSpec, :] = St.data[nonnegSpec, :].clip(min=0)

        # Force unimodal spectra
        # ----------------------------
        if unimodSpec != []:
            St.data = _unimodal_2D(
                St.data,
                idxes=unimodSpec,
                axis=1,
                tol=unimodSpecTol,
                mod=unimodSpecMod,
            )

        # recompute C for consistency (soft modeling)
        C.data = np.linalg.lstsq(St.data.T, X.data.T, rcond=None)[0].T

        # rescale spectra & concentrations
        if normSpec == "max":
            alpha = np.max(St.data, axis=1).reshape(nspecies, 1)
            St.data = St.data / alpha
            C.data = C.data * alpha.T
        elif normSpec == "euclid":
            alpha = np.linalg.norm(St.data, axis=1).reshape(nspecies, 1)
            St.data = St.data / alpha
            C.data = C.data * alpha.T

        # compute residuals
        # -----------------
        X_hat = dot(C, St)
        stdev2 = (X_hat - X.data).std()
        change = 100 * (stdev2 - stdev) / stdev
        stdev = stdev2

        stdev_PCA = (X_hat - Xpca.data).std()

        logentry = "{:3d}      {:10f}      {:10f}      {:10f}".format(
            niter, stdev_PCA, stdev2, change)
        log += logentry + "\n"
        info_(logentry)

        if change > 0:
            ndiv += 1
        else:
            ndiv = 0
            change = -change

        if change < tol:
            logentry = "converged !"
            log += logentry + "\n"
            info_(logentry)

    if ndiv == maxdiv:
        logline = (
            f"Optimization not improved since {maxdiv} iterations... unconverged "
            f"or 'tol' set too small ?\n")
        logline += "Stop ALS optimization"
        log += logline + "\n"
        info_(logline)

    if niter == maxit:
        logline = "Convergence criterion ('tol') not reached after {:d} iterations.".format(
            maxit)
        logline += " Stop ALS optimization"
        log += logline + "\n"
        info_(logline)

    self._X = X
    self._params = {
        "tol": tol,
        "maxit": maxit,
        "maxdiv": maxdiv,
        "nonnegConc": nonnegConc,
        "unimodConc": unimodConc,
        "unimodConcTol": unimodConcTol,
        "unimodConcMod": unimodConcMod,
        "closureConc": closureConc,
        "closureTarget": closureTarget,
        "closureMethod": closureMethod,
        "monoDecConc": monoDecConc,
        "monoDecTol": monoDecTol,
        "monoIncConc": monoIncConc,
        "monoIncTol": monoIncTol,
        "hardConc": hardConc,
        "getConc": getConc,
        "argsGetConc": argsGetConc,
        "hardC_to_C_idx": hardC_to_C_idx,
        "nonnegSpec": nonnegSpec,
        "unimodSpec": unimodSpec,
        "unimodSpecTol": unimodSpecTol,
        "unimodSpecMod": unimodSpecMod,
        "normSpec": normSpec,
    }

    self._C = C
    if hardConc is not None:
        self._fixedC = fixedC
        self._extOutput = extOutput
    else:
        self._fixedC = None
        self._extOutput = None

    self._St = St
    self._log = log

    self._Stsoft = Stsoft
    self._Chard = Chard
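
# Why: a self-contained sketch of the "scaling" closure constraint used in the
# ALS loop above: each selected concentration profile is rescaled by a
# least-squares factor so that the selected columns sum as closely as possible
# to the closure target (the toy 3-species C matrix below is made up).
import numpy as np

rng = np.random.default_rng(1)
C = np.abs(rng.random((10, 3)))          # toy concentration matrix
closureConc = [0, 1, 2]                  # all species subject to closure
closureTarget = np.ones(10)              # profiles should sum to 1

Q = np.linalg.lstsq(C[:, closureConc], closureTarget, rcond=None)[0]
C[:, closureConc] = C[:, closureConc] @ np.diag(Q)
print(C[:, closureConc].sum(axis=1))     # now close to 1 everywhere
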
def _interpret(self, script):
    """
    Interpreter of the script content
    """
    # init some flags
    modlabel = None
    common = False
    fixed = False
    reference = False

    # create a new FitParameters instance
    fp = FitParameters()

    # set the number of experiments
    fp.expnumber = len(self.datasets)
    info_("The number of experiment(s) is set to %d" % fp.expnumber)

    # start interpreting ------------------------------------------------------
    lines = script.split('\n')
    lc = 0

    for item in lines:
        lc += 1  # -------------- count the lines
        line = item.strip()
        if line == '' or line.startswith("#"):
            # this is a blank or comment line, go to next line
            continue

        # split around the semi-column
        s = line.split(':')
        if len(s) != 2:
            raise ValueError(
                'Cannot interpret line %d: A semi-column is missing?' % lc)

        key, values = s
        key = key.strip().lower()

        if key.startswith('model'):
            modlabel = values.lower().strip()
            if modlabel not in fp.models:
                fp.models.append(modlabel)
            common = False
            continue

        elif key.startswith('common') or key.startswith('vars'):
            common = True
            modlabel = 'common'
            continue

        elif key.startswith('shape'):
            shape = values.lower().strip()
            if shape is None:  # or (shape not in self._list_of_models and shape not in self._list_of_baselines):
                raise ValueError(
                    'Shape of this model "%s" was not specified or is not implemented' % shape)
            fp.model[modlabel] = shape
            common = False
            continue

        elif key.startswith("experiment"):
            # must be in common
            if not common:
                raise ValueError(
                    "'experiment_...' specification was found outside the common block.")
            if "variables" in key:
                expvars = values.lower().strip()
                expvars = expvars.replace(',', ' ').replace(';', ' ')
                expvars = expvars.split()
                fp.expvars.extend(expvars)
            continue

        else:
            if modlabel is None and not common:
                raise ValueError(
                    "The first definition should be a label for a model or a block of variables or constants.")

            # get the parameters
            if key.startswith('*'):
                fixed = True
                reference = False
                key = key[1:].strip()
            elif key.startswith('$'):
                fixed = False
                reference = False
                key = key[1:].strip()
            elif key.startswith('>'):
                fixed = True
                reference = True
                key = key[1:].strip()
            else:
                raise ValueError(
                    'Cannot interpret line %d: A parameter definition must start with *, $ or >' % lc)

            # store this parameter
            s = values.split(',')
            s = [ss.strip() for ss in s]
            if len(s) > 1 and ('[' in s[0]) and (']' in s[1]):  # list
                s[0] = "%s, %s" % (s[0], s[1])
                if len(s) > 2:
                    s[1:] = s[2:]

            if len(s) > 3:
                raise ValueError(
                    'line %d: value, min, max should be defined in this order' % lc)
            elif len(s) == 2:
                raise ValueError('only two items in line %d' % lc)
                # s.append('none')
            elif len(s) == 1:
                s.extend(['none', 'none'])

            value, mini, maxi = s
            if mini.strip().lower() in ['none', '']:
                mini = str(-1. / sys.float_info.epsilon)
            if maxi.strip().lower() in ['none', '']:
                maxi = str(+1. / sys.float_info.epsilon)

            if modlabel != 'common':
                ks = "%s_%s" % (key, modlabel)
                fp.common[key] = False
            else:
                ks = "%s" % key
                fp.common[key] = True

            fp.reference[ks] = reference

            if not reference:
                val = value.strip()
                val = eval(val)
                if isinstance(val, list):
                    # if the parameter is already a list, that's ok if the number of parameters is ok
                    if len(val) != fp.expnumber:
                        raise ValueError(
                            'the number of parameters (%d) is not the number of experiments.' % len(val))
                    if key not in fp.expvars:
                        raise ValueError(
                            'parameter %s is not declared as variable' % key)
                else:
                    if key in fp.expvars:
                        # we create a list of parameters corresponding
                        val = [val] * fp.expnumber

                fp[ks] = val, mini.strip(), maxi.strip(), fixed
            else:
                fp[ks] = value.strip()

    return fp
def test_read_carroucell_with_dirname():
    A = NDDataset.read_carroucell(os.path.join('irdata', 'carroucell_samp'))
    for x in A:
        info_('  ' + x.name + ': ' + str(x.shape))
    assert len(A) == 11
    assert A[3].shape == (6, 11098)
def read_carroucell(dataset=None, directory=None, **kwargs):
    """
    Open .spa files in a directory after a carroucell experiment.

    The files for a given sample are grouped in NDDatasets (sorted by
    acquisition date). The NDDatasets are returned in a list sorted by sample
    number. When the file containing the temperature data is present, the
    temperature is read and assigned as a label to each spectrum.

    Parameters
    ----------
    dataset : `NDDataset`
        The dataset to store the data and metadata.
        If None, a NDDataset is created.
    directory : str, optional
        If not specified, opens a dialog box.
    spectra : arraylike of 2 int (min, max), optional, default=None
        The first and last spectrum to be loaded as determined by their number.
        If None all spectra are loaded.
    discardbg : bool, optional, default=True
        If True : do not load background (sample #9).
    delta_clocks : int, optional, default=0
        Difference in seconds between the clocks used for spectra and
        temperature acquisition. Defined as
        t(thermocouple clock) - t(spectrometer clock).

    Returns
    -------
    nddataset
        |NDDataset| or list of |NDDataset|.

    See Also
    --------
    read_topspin : Read TopSpin Bruker NMR spectra.
    read_omnic : Read Omnic spectra.
    read_opus : Read OPUS spectra.
    read_spg : Read Omnic *.spg grouped spectra.
    read_spa : Read Omnic *.spa single spectra.
    read_srs : Read Omnic series.
    read_csv : Read CSV files.
    read_zip : Read Zip files.
    read_matlab : Read Matlab files.

    Notes
    -----
    All files are expected to be present in the same directory and their
    filenames are expected to be in the format : X_samplename_YYY.spa
    and for the background files : X_BCKG_YYYBG.spa
    where X is the sample holder number and YYY the spectrum number.
    """
    # check if the first parameter is a dataset
    # because we allow not to pass it
    if not isinstance(dataset, NDDataset):
        # probably did not specify a dataset
        # so the first parameter must be the directory
        if isinstance(dataset, str) and dataset != '':
            directory = dataset

    directory = readdirname(directory)

    if not directory:
        # probably cancel has been chosen in the open dialog
        info_("No directory was selected.")
        return

    spectra = kwargs.get('spectra', None)
    discardbg = kwargs.get('discardbg', True)
    delta_clocks = datetime.timedelta(seconds=kwargs.get('delta_clocks', 0))

    datasets = []

    # get the sorted list of spa files in the directory
    spafiles = sorted([
        f for f in os.listdir(directory)
        if (os.path.isfile(os.path.join(directory, f))
            and f[-4:].lower() == '.spa')
    ])

    # discard BKG files
    if discardbg:
        spafiles = sorted([f for f in spafiles if 'BCKG' not in f])

    # select files
    if spectra is not None:
        [min, max] = spectra
        if discardbg:
            spafiles = sorted([
                f for f in spafiles
                if min <= int(f.split('_')[2][:-4]) <= max and 'BCKG' not in f
            ])
        if not discardbg:
            spafilespec = sorted([
                f for f in spafiles
                if min <= int(f.split('_')[2][:-4]) <= max and 'BCKG' not in f
            ])
            spafileback = sorted([
                f for f in spafiles
                if min <= int(f.split('_')[2][:-6]) <= max and 'BCKG' in f
            ])
            spafiles = spafilespec + spafileback

    curfilelist = [spafiles[0]]
    curprefix = spafiles[0][::-1].split("_", 1)[1][::-1]

    for f in spafiles[1:]:
        if f[::-1].split("_", 1)[1][::-1] != curprefix:
            datasets.append(
                NDDataset.read_omnic(curfilelist, sortbydate=True,
                                     directory=directory))
            datasets[-1].name = os.path.basename(curprefix)
            curfilelist = [f]
            curprefix = f[::-1].split("_", 1)[1][::-1]
        else:
            curfilelist.append(f)

    datasets.append(
        NDDataset.read_omnic(curfilelist, sortbydate=True,
                             directory=directory))
    datasets[-1].name = os.path.basename(curprefix)

    # Now manage temperature
    Tfile = sorted(
        [f for f in os.listdir(directory) if f[-4:].lower() == '.xls'])
    if len(Tfile) == 0:
        print_("no temperature file")
    elif len(Tfile) > 1:
        warnings.warn(
            "several .xls/.csv files. The temperature will not be read")
    else:
        Tfile = Tfile[0]
        if Tfile[-4:].lower() == '.xls':
            book = xlrd.open_workbook(os.path.join(directory, Tfile))

        # determine experiment start and end time (thermocouple clock)
        ti = datasets[0].y.labels[0][0] + delta_clocks
        tf = datasets[-1].y.labels[-1][0] + delta_clocks

        # get thermocouple time and T information during the experiment
        t = []
        T = []
        sheet = book.sheet_by_index(0)
        for i in range(9, sheet.nrows):
            try:
                time = datetime.datetime.strptime(
                    sheet.cell(i, 0).value,
                    '%d/%m/%y %H:%M:%S').replace(tzinfo=datetime.timezone.utc)
                if ti <= time <= tf:
                    t.append(time)
                    T.append(sheet.cell(i, 4).value)
            except ValueError:
                pass
            except TypeError:
                pass

        # interpolate T = f(timestamp)
        tstamp = [time.timestamp() for time in t]
        # interpolate, except for the first and last points that are extrapolated
        interpolator = scipy.interpolate.interp1d(tstamp, T,
                                                  fill_value='extrapolate',
                                                  assume_sorted=True)

        for ds in datasets:
            # timestamp of spectra for the thermocouple clock
            tstamp_ds = [(label[0] + delta_clocks).timestamp()
                         for label in ds.y.labels]
            T_ds = interpolator(tstamp_ds)
            newlabels = np.hstack((ds.y.labels, T_ds.reshape((50, 1))))
            ds.y = Coord(title=ds.y.title, data=ds.y.data, labels=newlabels)

    if len(datasets) == 1:
        return datasets[0]  # a single dataset is returned

    # several datasets returned, sorted by sample #
    return sorted(datasets, key=lambda ds: int(re.split('-|_', ds.name)[0]))
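
# Why: a minimal, self-contained sketch of the temperature interpolation used
# above: thermocouple readings, recorded on their own clock, are mapped onto
# the spectra timestamps with linear interpolation (and extrapolation at the
# edges). All numbers below are made up for illustration.
import scipy.interpolate

tstamp = [0.0, 60.0, 120.0, 180.0]       # thermocouple timestamps (s)
T = [25.0, 30.0, 35.0, 40.0]             # temperatures (deg C)
interpolator = scipy.interpolate.interp1d(tstamp, T,
                                          fill_value='extrapolate',
                                          assume_sorted=True)
print(interpolator([30.0, 200.0]))       # -> [27.5, ~41.7]; the second value is extrapolated
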
def nmf(V, Winit, Hinit, tol, timelimit, maxiter):
    """
    (W,H) = nmf(V,Winit,Hinit,tol,timelimit,maxiter)

    W,H : output solution
    Winit,Hinit : initial solution
    tol : tolerance for a relative stopping condition
    timelimit, maxiter : limit of time and iterations
    """

    def nlssubprob(V, W, Hinit, tol, maxiter):
        """
        H, grad : output solution and gradient
        iter : #iterations used
        V, W : constant matrices
        Hinit : initial solution
        tol : stopping tolerance
        maxiter : limit of iterations
        """
        H = Hinit
        WtV = np.dot(W.T, V)
        WtW = np.dot(W.T, W)
        alpha = 1
        beta = 0.1

        for n_iter in range(1, maxiter + 1):
            grad = np.dot(WtW, H) - WtV
            if norm(grad * np.logical_or(grad < 0, H > 0)) < tol:
                break
            Hp = H

            # search step size
            for inner_iter in range(20):
                # gradient step
                Hn = H - alpha * grad
                # projection onto the non-negative orthant
                Hn *= Hn > 0
                d = Hn - H
                gradd = np.dot(grad.ravel(), d.ravel())
                dQd = np.dot(np.dot(WtW, d).ravel(), d.ravel())
                suff_decr = 0.99 * gradd + 0.5 * dQd < 0
                if inner_iter == 0:
                    decr_alpha = not suff_decr
                    Hp = H
                if decr_alpha:
                    if suff_decr:
                        H = Hn
                        break
                    else:
                        alpha = alpha * beta
                else:
                    if not suff_decr or (Hp == Hn).all():
                        H = Hp
                        break
                    else:
                        alpha = alpha / beta
                        Hp = Hn

        if n_iter == maxiter:
            info_('Max iter in nlssubprob')

        return H, grad, n_iter

    W = Winit
    H = Hinit
    initt = time()

    gradW = np.dot(W, np.dot(H, H.T)) - np.dot(V, H.T)
    gradH = np.dot(np.dot(W.T, W), H) - np.dot(W.T, V)
    initgrad = norm(np.r_[gradW, gradH.T])
    info_('Init gradient norm {:.3f}'.format(initgrad))
    tolW = max(0.001, tol) * initgrad
    tolH = tolW

    for myiter in range(1, maxiter):
        # stopping condition
        projnorm = norm(np.r_[gradW[np.logical_or(gradW < 0, W > 0)],
                              gradH[np.logical_or(gradH < 0, H > 0)]])
        if projnorm < tol * initgrad or time() - initt > timelimit:
            break

        (W, gradW, iterW) = nlssubprob(V.T, H.T, W.T, tolW, 10000)
        W = W.T
        gradW = gradW.T

        if iterW == 1:
            tolW = 0.1 * tolW

        (H, gradH, iterH) = nlssubprob(V, W, H, tolH, 10000)
        if iterH == 1:
            tolH = 0.1 * tolH

        if myiter % 10 == 0:
            stdout.write('.')

    info_('\nIter = {} Final proj-grad norm {:.3f}'.format(myiter, projnorm))

    return W, H
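
# Why: a minimal usage sketch of the projected-gradient NMF above on a random
# non-negative matrix (sizes and tolerances are made up). Assumptions: the
# listing's module normally provides `norm`, `time`, `stdout` and `info_` as
# in spectrochempy; stand-ins are set up here so the sketch runs on its own.
import sys
import numpy as np
from time import time
from numpy.linalg import norm

stdout = sys.stdout
try:
    info_
except NameError:  # stand-in logger for this self-contained demo only
    info_ = print

rng = np.random.default_rng(2)
V = np.abs(rng.random((30, 40)))     # data to factorize, V ~ W @ H
Winit = np.abs(rng.random((30, 5)))  # 5-component initial guesses
Hinit = np.abs(rng.random((5, 40)))

W, H = nmf(V, Winit, Hinit, tol=1e-3, timelimit=10, maxiter=200)
print(np.linalg.norm(V - W @ H) / np.linalg.norm(V))  # relative residual
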
def nmf(self, V, Winit, Hinit, tol, maxtime, maxiter):
    """
    NMF by alternating non-negative least squares using projected gradients.

    Parameters
    ==========
    V : |ndarray|
        numpy array to be analysed.
    Winit, Hinit : |ndarray|
        Initial solutions for concentration and spectral profiles.
    tol : float
        Tolerance for a relative stopping condition.
    maxtime : float
        Limit of time.
    maxiter : int
        Limit number of iterations.

    Returns
    =======
    W, H : |ndarray|
        Output solution.
    """
    W = Winit
    H = Hinit
    initt = time()

    gradW = np.dot(W, np.dot(H, H.T)) - np.dot(V, H.T)
    gradH = np.dot(np.dot(W.T, W), H) - np.dot(W.T, V)
    initgrad = norm(np.r_[gradW, gradH.T])
    info_(f"Init gradient norm {initgrad:.3f}")
    tolW = max(0.001, tol) * initgrad
    tolH = tolW

    for myiter in range(1, maxiter):
        # stopping condition
        projnorm = norm(
            np.r_[
                gradW[np.logical_or(gradW < 0, W > 0)],
                gradH[np.logical_or(gradH < 0, H > 0)],
            ]
        )
        if projnorm < tol * initgrad or time() - initt > maxtime:
            break

        (W, gradW, iterW) = self.nlssubprob(V.T, H.T, W.T, tolW, 1000)
        W = W.T
        gradW = gradW.T

        if iterW == 1:
            tolW = 0.1 * tolW

        (H, gradH, iterH) = self.nlssubprob(V, W, H, tolH, 1000)
        if iterH == 1:
            tolH = 0.1 * tolH

        if myiter % 10 == 0:
            stdout.write(".")

    info_(f"\nIter = {myiter} Final proj-grad norm {projnorm:.3f}")

    return W, H
def _read_carroucell(*args, **kwargs):

    _, directory = args
    directory = get_directory_name(directory)

    if not directory:  # pragma: no cover
        # probably cancel has been chosen in the open dialog
        info_("No directory was selected.")
        return

    spectra = kwargs.get("spectra", None)
    discardbg = kwargs.get("discardbg", True)
    delta_clocks = datetime.timedelta(seconds=kwargs.get("delta_clocks", 0))

    datasets = []

    # get the sorted list of spa files in the directory
    spafiles = sorted(get_filenames(directory, **kwargs)[".spa"])
    spafilespec = [f for f in spafiles if "BCKG" not in f.stem]
    spafileback = [f for f in spafiles if "BCKG" in f.stem]

    # select files
    prefix = lambda f: f.stem.split("_")[0]
    number = lambda f: int(f.stem.split("_")[1])
    if spectra is not None:
        [min, max] = spectra
        spafilespec = [f for f in spafilespec if min <= number(f) <= max]
        spafileback = [f for f in spafileback if min <= number(f) <= max]

    # discard BKG files
    spafiles = spafilespec
    if not discardbg:
        spafiles += spafileback

    # merge dataset with the same number
    curfilelist = [spafiles[0]]
    curprefix = prefix(spafiles[0])

    for f in spafiles[1:]:
        if prefix(f) != curprefix:
            ds = NDDataset.read_omnic(curfilelist, sortbydate=True,
                                      directory=directory, name=curprefix)
            datasets.append(ds)
            curfilelist = [f]
            curprefix = prefix(f)
        else:
            curfilelist.append(f)

    ds = NDDataset.read_omnic(curfilelist, sortbydate=True,
                              directory=directory, name=curprefix)
    datasets.append(ds)

    # Now manage temperature
    Tfile = sorted(
        [f for f in os.listdir(directory) if f[-4:].lower() == ".xls"])
    if len(Tfile) == 0:
        print_("no temperature file")
    elif len(Tfile) > 1:
        warnings.warn(
            "several .xls/.csv files. The temperature will not be read")
    else:
        Tfile = Tfile[0]
        if Tfile[-4:].lower() == ".xls":
            book = xlrd.open_workbook(os.path.join(directory, Tfile))

        # determine experiment start and end time (thermocouple clock)
        ti = datasets[0].y.labels[0][0] + delta_clocks
        tf = datasets[-1].y.labels[-1][0] + delta_clocks

        # get thermocouple time and T information during the experiment
        t = []
        T = []
        sheet = book.sheet_by_index(0)
        for i in range(9, sheet.nrows):
            try:
                time = datetime.datetime.strptime(
                    sheet.cell(i, 0).value,
                    "%d/%m/%y %H:%M:%S").replace(tzinfo=datetime.timezone.utc)
                if ti <= time <= tf:
                    t.append(time)
                    T.append(sheet.cell(i, 4).value)
            except ValueError:
                pass
            except TypeError:
                pass

        # interpolate T = f(timestamp)
        tstamp = [time.timestamp() for time in t]
        # interpolate, except for the first and last points that are extrapolated
        interpolator = scipy.interpolate.interp1d(tstamp, T,
                                                  fill_value="extrapolate",
                                                  assume_sorted=True)

        for ds in datasets:
            # timestamp of spectra for the thermocouple clock
            tstamp_ds = [(label[0] + delta_clocks).timestamp()
                         for label in ds.y.labels]
            T_ds = interpolator(tstamp_ds)
            newlabels = np.hstack((ds.y.labels, T_ds.reshape((50, 1))))
            ds.y = Coord(title=ds.y.title, data=ds.y.data, labels=newlabels)

    if len(datasets) == 1:
        return datasets[0]  # a single dataset is returned

    # several datasets returned, sorted by sample #
    return sorted(datasets, key=lambda ds: re.split("-|_", ds.name)[0])
def __init__(self, dataset, **kwargs):

    super().__init__()

    # ------------------------------------------------------------------------
    # Utility functions
    # ------------------------------------------------------------------------
    def figures_of_merit(X, maxPIndex, C, St, j):
        # return % explained variance and stdev of residuals when the jth compound is added
        C[:, j] = X[:, maxPIndex[j]]
        St[0:j + 1, :] = np.linalg.lstsq(C.data[:, 0:j + 1], X.data,
                                         rcond=None)[0]
        Xhat = dot(C[:, 0:j + 1], St[0:j + 1, :])
        res = Xhat - X
        stdev_res = np.std(res)
        rsquare = 1 - np.linalg.norm(res)**2 / np.linalg.norm(X)**2
        return rsquare, stdev_res

    def str_iter_summary(j, index, coord, rsquare, stdev_res, diff):
        # return a formatted list of figures of merit at a given iteration
        string = "{:4}  {:5}  {:8.1f} {:10.4f} {:10.4f} ".format(
            j + 1, index, coord, stdev_res, rsquare)
        return string

    def get_x_data(X):
        if X.x is not None and not X.x.is_empty:  # TODO what about labels?
            return X.x.data
        else:
            return np.arange(X.shape[-1])

    # ------------------------------------------------------------------------
    # Check data
    # ------------------------------------------------------------------------
    X = dataset

    if len(X.shape) != 2:
        raise ValueError("For now, SIMPLISMA only handles 2D Datasets")

    if np.min(X.data) < 0:
        warnings.warn("SIMPLISMA does not handle easily negative values.")
        # TODO: check whether negative values should be set to zero or not.

    if "verbose" in kwargs.keys():
        warnings.warn(
            "verbose deprecated. Instead, use set_loglevel(INFO) before launching SIMPLISMA",
            DeprecationWarning,
        )
        set_loglevel(INFO)

    interactive = kwargs.get("interactive", False)
    tol = kwargs.get("tol", 0.1)
    noise = kwargs.get("noise", 3)
    n_pc = kwargs.get("n_pc", 2)
    if n_pc < 2 or not isinstance(n_pc, int):
        raise ValueError(
            "Oh you did not just... 'MA' in simplisMA stands for Mixture Analysis. "
            "The number of pure compounds should be an integer greater than or equal to 2"
        )
    if interactive:
        n_pc = 100

    # ------------------------------------------------------------------------
    # Core
    # ------------------------------------------------------------------------
    if not interactive:
        logs = "*** Automatic SIMPL(I)SMA analysis *** \n"
    else:
        logs = "*** Interactive SIMPLISMA analysis *** \n"
    logs += "dataset: {}\n".format(X.name)
    logs += "  noise: {:2} %\n".format(noise)
    if not interactive:
        logs += "    tol: {:2} %\n".format(tol)
    logs += "   n_pc: {:2}\n".format(n_pc)
    logs += "\n"
    logs += "#iter index_pc  coord_pc   Std(res)   R^2   \n"
    logs += "---------------------------------------------"
    info_(logs)
    logs += "\n"

    # Containers for returned objects and intermediate data
    # -----------------------------------------------------
    # purity 'spectra' (generally spectra if X is passed,
    # but could also be concentrations if X.T is passed)
    Pt = NDDataset.zeros((n_pc, X.shape[-1]))
    Pt.name = "Purity spectra"
    Pt.set_coordset(y=Pt.y, x=X.x)
    Pt.y.title = "# pure compound"

    # weight matrix
    w = NDDataset.zeros((n_pc, X.shape[-1]))
    w.set_coordset(y=Pt.y, x=X.x)

    # Stdev spectrum
    s = NDDataset.zeros((n_pc, X.shape[-1]))
    s.name = "Standard deviation spectra"
    s.set_coordset(y=Pt.y, x=X.x)

    # maximum purity indexes and coordinates
    maxPIndex = [0] * n_pc
    maxPCoordinate = [0] * n_pc

    # Concentration matrix
    C = NDDataset.zeros((X.shape[-2], n_pc))
    C.name = "Relative Concentrations"
    C.set_coordset(y=X.y, x=C.x)
    C.x.title = "# pure compound"

    # Pure component spectral profiles
    St = NDDataset.zeros((n_pc, X.shape[-1]))
    St.name = "Pure compound spectra"
    St.set_coordset(y=Pt.y, x=X.x)

    # Compute Statistics
    # ------------------
    sigma = np.std(X.data, axis=0)
    mu = np.mean(X.data, axis=0)
    alpha = (noise / 100) * np.max(mu)
    lamda = np.sqrt(mu**2 + sigma**2)
    p = sigma / (mu + alpha)

    # scale dataset
    Xscaled = X.data / np.sqrt(mu**2 + (sigma + alpha)**2)

    # COO dispersion matrix
    COO = (1 / X.shape[-2]) * np.dot(Xscaled.T, Xscaled)

    # Determine the purest variables
    j = 0
    finished = False
    while not finished:
        # compute first purest variable and weights
        if j == 0:
            w[j, :] = lamda**2 / (mu**2 + (sigma + alpha)**2)
            s[j, :] = sigma * w[j, :]
            Pt[j, :] = p * w[j, :]

            # get index and coordinate of pure variable
            maxPIndex[j] = np.argmax(Pt[j, :].data)
            maxPCoordinate[j] = get_x_data(X)[maxPIndex[j]]

            # compute figures of merit
            rsquare0, stdev_res0 = figures_of_merit(X, maxPIndex, C, St, j)

            # add summary to log
            llog = str_iter_summary(j, maxPIndex[j], maxPCoordinate[j],
                                    rsquare0, stdev_res0, "")
            logs += llog + "\n"

            if interactive:
                print(llog)

                # should plot purity and stdev, does not work for the moment
                # TODO: fix the code below
                # fig1, (ax1, ax2) = plt.subplots(2, 1)
                # Pt[j, :].plot(ax=ax1)
                # ax1.set_title('Purity spectrum #{}'.format(j + 1))
                # ax1.axvline(maxPCoordinate[j], color='r')
                # s[j, :].plot(ax=ax2)
                # ax2.set_title('standard deviation spectrum #{}'.format(j + 1))
                # ax2.axvline(maxPCoordinate[j], color='r')
                # plt.show()

                ans = ""
                while ans.lower() not in ["a", "c"]:
                    ans = input("   |--> (a) Accept, (c) Change: ")

                while ans.lower() != "a":
                    new = input(
                        "   |--> enter the new index (int) or variable value (float): ")
                    try:
                        new = int(new)
                        maxPIndex[j] = new
                        maxPCoordinate[j] = get_x_data(X)[maxPIndex[j]]
                    except ValueError:
                        try:
                            new = float(new)
                            maxPIndex[j] = np.argmin(abs(get_x_data(X) - new))
                            maxPCoordinate[j] = get_x_data(X)[maxPIndex[j]]
                        except ValueError:
                            print(
                                "Incorrect answer. Please enter a valid index or value")

                    rsquare0, stdev_res0 = figures_of_merit(X, maxPIndex, C, St, j)

                    llog = str_iter_summary(j, maxPIndex[j], maxPCoordinate[j],
                                            rsquare0, stdev_res0, "")
                    logs += "   |--> changed pure variable #1\n"
                    logs += llog + "\n"
                    info_(llog)

                    ans = input("   |--> (a) Accept, (c) Change: ")
                # ans was [a]ccept
                j += 1
            if not interactive:
                j += 1
            prev_stdev_res = stdev_res0

        else:
            # compute jth purest variable
            for i in range(X.shape[-1]):
                Mji = np.zeros((j + 1, j + 1))
                idx = [i] + maxPIndex[0:j]
                for line in range(j + 1):
                    for col in range(j + 1):
                        Mji[line, col] = COO[idx[line], idx[col]]
                w[j, i] = np.linalg.det(Mji)
            Pt[j, :] = p * w[j, :]
            s[j, :] = sigma * w[j, :]

            # get index and coordinate of jth pure variable
            maxPIndex[j] = np.argmax(Pt[j, :].data)
            maxPCoordinate[j] = get_x_data(X)[maxPIndex[j]]

            # compute figures of merit
            rsquarej, stdev_resj = figures_of_merit(X, maxPIndex, C, St, j)
            diff = 100 * (stdev_resj - prev_stdev_res) / prev_stdev_res
            prev_stdev_res = stdev_resj

            # add summary to log
            llog = str_iter_summary(j, maxPIndex[j], maxPCoordinate[j],
                                    rsquarej, stdev_resj, diff)
            logs += llog + "\n"

            if interactive:
                info_(llog)

                # TODO: I suggest to use jupyter widgets for the interactivity!
                # should plot purity and stdev, does not work for the moment
                # TODO: fix the code below
                # ax1.clear()
                # ax1.set_title('Purity spectrum #{}'.format(j + 1))
                # Pt[j, :].plot(ax=ax1)
                # for coord in maxPCoordinate[:-1]:
                #     ax1.axvline(coord, color='g')
                # ax1.axvline(maxPCoordinate[j], color='r')
                # ax2.clear()
                # ax2.set_title('standard deviation spectrum #{}'.format(j + 1))
                # s[j, :].plot(ax=ax2)
                # for coord in maxPCoordinate[:-1]:
                #     ax2.axvline(coord, color='g')
                # ax2.axvline(maxPCoordinate[j], color='r')
                # plt.show()

                ans = ""
                while ans.lower() not in ["a", "c", "r", "f"]:
                    ans = input(
                        "   |--> (a) Accept and continue, (c) Change, (r) Reject, (f) Accept and finish: ")

                while ans.lower() == "c":
                    new = input(
                        "   |--> enter the new index (int) or variable value (float): ")
                    try:
                        new = int(new)
                        maxPIndex[j] = new
                        maxPCoordinate[j] = get_x_data(X)[maxPIndex[j]]
                    except ValueError:
                        try:
                            new = float(new)
                            maxPIndex[j] = np.argmin(abs(get_x_data(X) - new))
                            maxPCoordinate[j] = get_x_data(X)[maxPIndex[j]]
                        except ValueError:
                            print(
                                "   |--> Incorrect answer. Please enter a valid index or value")

                    rsquarej, stdev_resj = figures_of_merit(X, maxPIndex, C, St, j)
                    diff = 100 * (stdev_resj - prev_stdev_res) / prev_stdev_res
                    prev_stdev_res = stdev_resj

                    logs += f"   |--> changed pure variable #{j + 1}\n"
                    llog = str_iter_summary(j, maxPIndex[j], maxPCoordinate[j],
                                            rsquarej, stdev_resj, diff)
                    logs += llog + "\n"
                    info_(llog)

                    info_(
                        f"purest variable #{j + 1} set at index = {maxPIndex[j]} ; x = {maxPCoordinate[j]}")
                    ans = input(
                        "   |--> (a) Accept and continue, (c) Change, (r) Reject, (f) Accept and stop: ")

                if ans.lower() == "r":
                    maxPCoordinate[j] = 0
                    maxPIndex[j] = 0
                    logs += f"   |--> rejected pure variable #{j + 1}\n"
                    j = j - 1

                elif ans.lower() == "a":
                    j = j + 1

                elif ans.lower() == "f":
                    finished = True
                    j = j + 1
                    llog = (f"\n**** Interrupted by user at compound # {j} "
                            f"\n**** End of SIMPL(I)SMA analysis.")
                    logs += llog + "\n"
                    Pt = Pt[0:j, :]
                    St = St[0:j, :]
                    s = s[0:j, :]
                    C = C[:, 0:j]

            # not interactive
            else:
                j = j + 1
                if (1 - rsquarej) < tol / 100:
                    llog = (
                        f"\n**** Unexplained variance lower than 'tol' ({tol}%) \n"
                        "**** End of SIMPL(I)SMA analysis.")
                    logs += llog + "\n"
                    Pt = Pt[0:j, :]
                    St = St[0:j, :]
                    s = s[0:j, :]
                    C = C[:, 0:j]
                    info_(llog)
                    finished = True

        if j == n_pc:
            if not interactive:
                llog = (
                    f"\n**** Reached maximum number of pure compounds 'n_pc' ({n_pc}) \n"
                    "**** End of SIMPL(I)SMA analysis.")
                logs += llog + "\n"
                info_(llog)
            finished = True

    Pt.description = "Purity spectra from SIMPLISMA:\n" + logs
    C.description = "Concentration/contribution matrix from SIMPLISMA:\n" + logs
    St.description = "Pure compound spectra matrix from SIMPLISMA:\n" + logs
    s.description = "Standard deviation spectra matrix from SIMPLISMA:\n" + logs

    self._logs = logs
    self._X = X
    self._Pt = Pt
    self._C = C
    self._St = St
    self._s = s
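
# Why: a self-contained sketch of the first SIMPLISMA purity spectrum computed
# above: the purity of each variable (column) is its standard deviation over
# its mean, damped by an offset `alpha` derived from the noise level so that
# near-zero-mean (noisy) variables do not get spuriously high purity. The toy
# data below are made up for illustration.
import numpy as np

rng = np.random.default_rng(3)
X = np.abs(rng.random((50, 200)))             # 50 spectra x 200 variables
noise = 3                                     # noise level, in %

sigma = np.std(X, axis=0)
mu = np.mean(X, axis=0)
alpha = (noise / 100) * np.max(mu)
lamda = np.sqrt(mu**2 + sigma**2)
w0 = lamda**2 / (mu**2 + (sigma + alpha)**2)  # first weight vector
p0 = (sigma / (mu + alpha)) * w0              # first purity spectrum
print(int(np.argmax(p0)))                     # index of the first purest variable
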
def __init__(self, dataset, guess, **kwargs): # lgtm [py/missing-call-to-init] """ Parameters ---------- dataset : |NDDataset| The dataset on which to perform the MCR-ALS analysis guess : |NDDataset| Initial concentration or spectra verbose : bool If set to True, prints a summary of residuals and residuals change at each iteration. default = False. In any case, the same information is returned in self.logs **kwargs : dict Optimization parameters : See Other Parameters. Other Parameters ---------------- tol : float, optional, default=0.1 Convergence criterion on the change of resisuals. (percent change of standard deviation of residuals). maxit : int, optional, default=50 Maximum number of ALS minimizations. maxdiv : int, optional, default=5. Maximum number of successive non-converging iterations. nonnegConc : list or tuple, default=Default [0, 1, ...] (only non-negative concentrations) Index of species having non-negative concentration profiles. For instance [0, 2] indicates that species #0 and #2 have non-negative conc profiles while species #1 can have negative concentrations. unimodConc : list or tuple, Default=[0, 1, ...] (only unimodal concentration profiles) index of species having unimodal concentrationsprofiles. closureConc : list or tuple, Default=None (no closure) Index of species subjected to a closure constraint. externalConc: list or tuple, Default None (no external concentration). Index of species for which a concentration profile is provided by an external function. getExternalConc : callable An external function that will provide `n_ext` concentration profiles: getExternalConc(C, extConc, ext_to_C_idx, *args) -> extC or etExternalConc(C, extConc, ext_to_C_idx, *args) -> (extC, out2, out3, ...) where C is the current concentration matrix, *args are the parameters needed to completely specify the function, extC is a nadarray or NDDataset of shape (C.y, n_ext), and out1, out2, ... are supplementary outputs returned by the function (e.g. optimized rate parameters) args : tuple, optional. Extra arguments passed to the external function external_to_C_idx : array or tuple, Default=np.arange(next) Indicates the correspondence between the indexes of external chemical profiles and the columns of the C matrix. [1, None, 0] indicates that the first external profile is the second pure species (index 1). nonnegSpec : list or tuple, Default [1, ..., 1] (only non-negative spectra) Indicates species having non-negative spectra unimodSpec : list or tuple, Default [0, ..., 0] (no unimodal concentration profiles) Indicates species having unimodal spectra """ verbose = kwargs.pop('verbose', False) if verbose: set_loglevel(INFO) # Check initial data # ------------------------------------------------------------------------ initConc, initSpec = False, False if type(guess) is np.ndarray: guess = NDDataset(guess) X = dataset if X.shape[0] == guess.shape[0]: initConc = True C = guess.copy() C.name = 'Pure conc. 
profile, mcs-als of ' + X.name nspecies = C.shape[1] elif X.shape[1] == guess.shape[1]: initSpec = True St = guess.copy() St.name = 'Pure spectra profile, mcs-als of ' + X.name nspecies = St.shape[0] else: raise ValueError('the dimensions of initial concentration ' 'or spectra dataset do not match the data') ny, nx = X.shape # makes a PCA with same number of species Xpca = PCA(X).reconstruct(n_pc=nspecies) # Get optional parameters in kwargs or set them to their default # ------------------------------------------------------------------------ # TODO: make a preference file to set this kwargs # optimization tol = kwargs.get('tol', 0.1) maxit = kwargs.get('maxit', 50) maxdiv = kwargs.get('maxdiv', 5) # constraints on concentrations nonnegConc = kwargs.get('nonnegConc', np.arange(nspecies)) unimodConc = kwargs.get('unimodConc', np.arange(nspecies)) unimodTol = kwargs.get('unimodTol', 1.1) unimodMod = kwargs.get('unimodMod', 'strict') closureConc = kwargs.get('closureConc', None) if closureConc is not None: closureTarget = kwargs.get('closureTarget', np.ones(ny)) closureMethod = kwargs.get('closureMethod', 'scaling') monoDecConc = kwargs.get('monoDecConc', None) monoDecTol = kwargs.get('monoDecTol', 1.1) monoIncConc = kwargs.get('monoIncConc', None) monoIncTol = kwargs.get('monoIncTol', 1.1) externalConc = kwargs.get('externalConc', None) if externalConc is not None: external_to_C_idx = kwargs.get('external_to_C_idx', np.arange(nspecies)) if externalConc is not None: try: getExternalConc = kwargs.get('getExternalConc') except Exception: raise ValueError('A function must be given to get the external concentration profile(s)') external_to_C_idx = kwargs.get('external_to_C_idx', externalConc) args = kwargs.get('args', ()) # constraints on spectra nonnegSpec = kwargs.get('nonnegSpec', np.arange(nspecies)) normSpec = kwargs.get('normSpec', None) # TODO: add unimodal constraint on spectra # Compute initial spectra or concentrations (first iteration...) # ------------------------------------------------------------------------ if initConc: if C.coordset is None: C.set_coordset(y=X.y, x=C.x) St = NDDataset(np.linalg.lstsq(C.data, X.data, rcond=None)[0]) St.name = 'Pure spectra profile, mcs-als of ' + X.name St.title = X.title cy = C.x.copy() if C.x else None cx = X.x.copy() if X.x else None St.set_coordset(y=cy, x=cx) if initSpec: if St.coordset is None: St.set_coordset(y=St.y, x=X.x) Ct = np.linalg.lstsq(St.data.T, X.data.T, rcond=None)[0] C = NDDataset(Ct.T) C.name = 'Pure conc. 
profile, mcs-als of ' + X.name C.title = 'concentration' cx = St.y.copy() if St.y else None cy = X.y.copy() if X.y else None C.set_coordset(y=cy, x=cx) change = tol + 1 stdev = X.std() # .data[0] niter = 0 ndiv = 0 logs = '*** ALS optimisation log***\n' logs += '#iter Error/PCA Error/Exp %change\n' logs += '---------------------------------------------------' info_(logs) while change >= tol and niter < maxit and ndiv < maxdiv: C.data = np.linalg.lstsq(St.data.T, X.data.T, rcond=None)[0].T niter += 1 # Force non-negative concentration # -------------------------------- if nonnegConc is not None: for s in nonnegConc: C.data[:, s] = C.data[:, s].clip(min=0) # Force unimodal concentration # ---------------------------- if unimodConc is not None: for s in unimodConc: maxid = np.argmax(C.data[:, s]) curmax = C.data[maxid, s] curid = maxid while curid > 0: curid -= 1 if C.data[curid, s] > curmax * unimodTol: if unimodMod == 'strict': C.data[curid, s] = C.data[curid + 1, s] if unimodMod == 'smooth': C.data[curid, s] = (C.data[curid, s] + C.data[ curid + 1, s]) / 2 C.data[curid + 1, s] = C.data[curid, s] curid = curid + 2 curmax = C.data[curid, s] curid = maxid while curid < ny - 1: curid += 1 if C.data[curid, s] > curmax * unimodTol: if unimodMod == 'strict': C.data[curid, s] = C.data[curid - 1, s] if unimodMod == 'smooth': C.data[curid, s] = (C.data[curid, s] + C.data[ curid - 1, s]) / 2 C.data[curid - 1, s] = C.data[curid, s] curid = curid - 2 curmax = C.data[curid, s] # Force monotonic increase # ------------------------ if monoIncConc is not None: for s in monoIncConc: for curid in np.arange(ny - 1): if C.data[curid + 1, s] < C.data[curid, s] / monoIncTol: C.data[curid + 1, s] = C.data[curid, s] # Force monotonic decrease # ---------------------------------------------- if monoDecConc is not None: for s in monoDecConc: for curid in np.arange(ny - 1): if C.data[curid + 1, s] > C.data[curid, s] * monoDecTol: C.data[curid + 1, s] = C.data[curid, s] # Closure # ------------------------------------------ if closureConc is not None: if closureMethod == 'scaling': Q = np.linalg.lstsq(C.data[:, closureConc], closureTarget.T, rcond=None)[0] C.data[:, closureConc] = np.dot(C.data[:, closureConc], np.diag(Q)) elif closureMethod == 'constantSum': totalConc = np.sum(C.data[:, closureConc], axis=1) C.data[:, closureConc] = C.data[:, closureConc] * closureTarget[:, None] / totalConc[:, None] # external concentration profiles # ------------------------------------------ if externalConc is not None: extOutput = getExternalConc(*((C, externalConc, external_to_C_idx,) + args)) if isinstance(extOutput, dict): extC = extOutput['concentrations'] args = extOutput['new_args'] else: extC = extOutput if type(extC) is NDDataset: extC = extC.data C.data[:, externalConc] = extC[:, external_to_C_idx] # stores C in C_hard Chard = C.copy() # compute St St.data = np.linalg.lstsq(C.data, X.data, rcond=None)[0] # stores St in Stsoft Stsoft = St.copy() # Force non-negative spectra # -------------------------- if nonnegSpec is not None: St.data[nonnegSpec, :] = St.data[nonnegSpec, :].clip(min=0) # recompute C for consistency(soft modeling) C.data = np.linalg.lstsq(St.data.T, X.data.T, rcond=None)[0].T # rescale spectra & concentrations if normSpec == 'max': alpha = np.max(St.data, axis=1).reshape(nspecies, 1) St.data = St.data / alpha C.data = C.data * alpha.T elif normSpec == 'euclid': alpha = np.linalg.norm(St.data, axis=1).reshape(nspecies, 1) St.data = St.data / alpha C.data = C.data * alpha.T # compute residuals # 
    # -----------------
    X_hat = dot(C, St)
    stdev2 = (X_hat - X.data).std()
    change = 100 * (stdev2 - stdev) / stdev
    stdev = stdev2

    stdev_PCA = (X_hat - Xpca.data).std()
    # TODO: check PCA: values are different from the Arnaud version?
    logentry = '{:3d}    {:10f}    {:10f}    {:10f}'.format(niter, stdev_PCA,
                                                            stdev2, change)
    logs += logentry + '\n'
    info_(logentry)

    if change > 0:
        ndiv += 1
    else:
        ndiv = 0
        change = -change

    if change < tol:
        logentry = 'converged!'
        logs += logentry + '\n'
        info_(logentry)

    if ndiv == maxdiv:
        logline = (f"Optimization not improved after {maxdiv} iterations... "
                   f"unconverged or 'tol' set too small?\n")
        logline += 'Stop ALS optimization.'
        logs += logline + '\n'
        info_(logline)

    if niter == maxit:
        logline = ("Convergence criterion ('tol') not reached after "
                   f"{maxit:d} iterations. ")
        logline += 'Stop ALS optimization.'
        logs += logline + '\n'
        info_(logline)

self._X = X
self._params = kwargs
self._C = C
if externalConc is not None:
    self._extC = extC
    self._extOutput = extOutput
else:
    self._extC = None
    self._extOutput = None
self._St = St
self._logs = logs
self._Stsoft = Stsoft
self._Chard = Chard
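
# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original source). It assumes the
# ALS routine above is reached through a MCRALS-like object with the signature
# MCRALS(X, guess, **kwargs), and that the constraint kwargs map one-to-one to
# the options read with kwargs.get() above; accessor names are hypothetical.
def _example_mcrals_usage(X, guess):
    from spectrochempy import MCRALS  # assumed public entry point

    mcr = MCRALS(
        X,                        # NDDataset, shape (ny, nx)
        guess,                    # initial C (ny, nspecies) or St (nspecies, nx)
        tol=0.1,                  # stop when the |%change| of the residuals < tol
        maxit=50,                 # hard limit on ALS iterations
        nonnegConc=[0, 1],        # non-negative concentrations for species 0 and 1
        unimodConc=[0],           # unimodal concentration profile for species 0
        closureConc=[0, 1],       # closure constraint over these species
        closureMethod='scaling',  # or 'constantSum'
        normSpec='euclid',        # rescale spectra to unit euclidean norm
    )
    return mcr.C, mcr.St          # hypothetical accessors for the resolved profiles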
def _interpret(self, script):
    """
    Interpreter of the script content.
    """
    # init some flags
    modlabel = None
    common = False
    fixed = False
    reference = False

    # create a new FitParameters instance
    fp = FitParameters()

    # set the number of experiments
    fp.expnumber = len(self.datasets)
    info_(f"The number of experiment(s) is set to {fp.expnumber}")

    # start interpreting -------------------------------------------------------
    lines = script.split("\n")
    lc = 0

    for item in lines:
        lc += 1  # count the lines
        line = item.strip()
        if line == "" or line.startswith("#"):
            # this is a blank or comment line, go to the next line
            continue

        # split around the colon
        s = line.split(":")
        if len(s) != 2:
            raise ValueError(f"Cannot interpret line {lc}: a colon is missing?")

        key, values = s
        key = key.strip().lower()

        if key.startswith("model"):
            modlabel = values.lower().strip()
            if modlabel not in fp.models:
                fp.models.append(modlabel)
            common = False
            continue

        elif key.startswith("common") or key.startswith("vars"):
            common = True
            modlabel = "common"
            continue

        elif key.startswith("shape"):
            shape = values.lower().strip()
            if not shape:  # or (shape not in self._list_of_models and shape not in self._list_of_baselines):
                raise ValueError(
                    f"Shape of this model `{shape}` was not specified or is not implemented"
                )
            fp.model[modlabel] = shape
            common = False
            continue

        elif key.startswith("experiment"):
            # must be in the common block
            if not common:
                raise ValueError(
                    "'experiment_...' specification was found outside the common block."
                )
            if "variables" in key:
                expvars = values.lower().strip()
                expvars = expvars.replace(",", " ").replace(";", " ")
                expvars = expvars.split()
                fp.expvars.extend(expvars)
            continue

        else:
            if modlabel is None and not common:
                raise ValueError(
                    "The first definition should be a label for a model or a block of variables or constants."
                )

            # get the parameters
            if key.startswith("*"):
                fixed = True
                reference = False
                key = key[1:].strip()
            elif key.startswith("$"):
                fixed = False
                reference = False
                key = key[1:].strip()
            elif key.startswith(">"):
                fixed = True
                reference = True
                key = key[1:].strip()
            else:
                raise ValueError(
                    f"Cannot interpret line {lc}: a parameter definition must start with *, $ or >"
                )

            # store this parameter
            s = values.split(",")
            s = [ss.strip() for ss in s]
            if len(s) > 1 and ("[" in s[0]) and ("]" in s[1]):  # list
                s[0] = f"{s[0]}, {s[1]}"
                if len(s) > 2:
                    s[1:] = s[2:]
            if len(s) > 3:
                raise ValueError(
                    f"line {lc}: value, min, max should be defined in this order"
                )
            elif len(s) == 2:
                raise ValueError(f"only two items in line {lc}")
                # s.append('none')
            elif len(s) == 1:
                s.extend(["none", "none"])

            value, mini, maxi = s
            if mini.strip().lower() in ["none", ""]:
                mini = str(-1.0 / sys.float_info.epsilon)
            if maxi.strip().lower() in ["none", ""]:
                maxi = str(+1.0 / sys.float_info.epsilon)

            if modlabel != "common":
                ks = f"{key}_{modlabel}"
                fp.common[key] = False
            else:
                ks = f"{key}"
                fp.common[key] = True

            fp.reference[ks] = reference

            if not reference:
                val = value.strip()
                val = eval(val)  # accepts numbers, lists or expressions from the script
                if isinstance(val, list):
                    # a list is ok if the number of parameters matches the number of experiments
                    if len(val) != fp.expnumber:
                        raise ValueError(
                            f"the number of parameters {len(val)} is not the number of experiments."
                        )
                    if key not in fp.expvars:
                        raise ValueError(
                            f"parameter {key} is not declared as variable"
                        )
                else:
                    if key in fp.expvars:
                        # we create a list of parameters, one per experiment
                        val = [val] * fp.expnumber
                fp[ks] = val, mini.strip(), maxi.strip(), fixed
            else:
                fp[ks] = value.strip()

    return fp
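
# ----------------------------------------------------------------------------
# Example of a script accepted by _interpret() above (a sketch; the shape name
# 'gaussianmodel' and the parameter names are illustrative assumptions, not
# taken from the source). `$` marks a variable parameter, `*` a fixed one and
# `>` a reference; values are given as `value, min, max`, with 'none' for an
# unbounded side.
_EXAMPLE_FIT_SCRIPT = """
COMMON:
# block of variables shared by all models
experiment_variables: ampl
$ offset: 0.0, none, none

MODEL: line1
shape: gaussianmodel
$ ampl: 1.0, 0.0, none
$ pos: 3520.0, 3400.0, 3700.0
* width: 200.0, none, none
"""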
def run(self, maxiter=100, maxfun=None, every=10, method='simplex', **kwargs):
    """
    Main fitting procedure.

    Parameters
    ----------
    maxiter : int
        Maximum number of iterations.
    maxfun : int
        Maximum number of function calls.
    every : int
        Number of function calls between two displays.
    method : str
        Either 'simplex' or 'hopping'.
    dry : bool
        If True, do not optimize; only display the starting parameters.
    """
    if not self.silent:
        level = preferences.log_level
        if level > INFO:
            preferences.log_level = INFO
        info_('*' * 50)
        info_('  Entering fitting procedure')
        info_('*' * 50)

    global niter, chi2, everyiter, ncalls
    ncalls = 0
    everyiter = every
    niter = 0

    # internally defined chi2 function
    def funchi2(params, datasets, *constraints):
        """
        Return the normalized chi2: sqrt(sum((data - model)**2) / sum(data[0]**2)).
        """
        global chi2, ncalls
        # model spectrum
        chi2 = 0
        som = 0
        ncalls += 1

        for exp_idx, dataset in enumerate(datasets):
            modeldata = self._get_modeldata(dataset, exp_idx)[0]
            # baseline is already summed with modeldata[-1]

            # important to work with the real component of the dataset,
            # not the complex number
            data = dataset.real.data.squeeze()

            # if not dataset.is_2d:
            mdata = modeldata[-1]  # modelsum
            # else:
            #     mdata = modeldata.values

            merror = 1.0
            # if dataset.is_2d:
            #     if constraints:
            #         # Case of SQ-DQ experiments
            #         if self.kind == 'SQ-DQ' and 'max_connections' in constraints[0]:
            #             # check connectivity numbers
            #             nbconnections = {}
            #             for key in params.keys():
            #                 if 'pos1' in key:
            #                     connect = key[-2:]
            #                     key = 'ampl_line_' + connect  # get amplitude
            #                     ki = connect[0].upper()
            #                     if ki not in nbconnections.keys():
            #                         nbconnections[ki] = 0
            #                     if int(params[key]) > 0:
            #                         nbconnections[ki] += 1
            #             for k, v in nbconnections.items():
            #                 if v > constraints[0]['max_connections']:
            #                     merror *= v * 10.

            diff = data - mdata
            chi2 += np.sum(diff ** 2) * merror
            som += np.sum(data[0] ** 2)

        chi2 = np.sqrt(chi2 / som)
        return chi2
    # end chi2 function --------------------------------------------------------

    # callback function --------------------------------------------------------
    def callback(*args, **kwargs):
        """
        Callback log.info function.
        """
        global niter, chi2, everyiter, ncalls
        niter += 1
        if niter % everyiter != 0:
            return
        if not self.silent:
            display.clear_output(wait=True)
            info_("Iterations: %d, Calls: %d (chi2: %.5f)" % (niter, ncalls, chi2))
            sys.stdout.flush()
    # end callback function ----------------------------------------------------

    fp = self.fp  # starting parameters

    dry = kwargs.get("dry", False)

    if not dry:
        fp, fopt = optimize(funchi2, fp,
                            args=(self.datasets,),
                            maxfun=maxfun,
                            maxiter=maxiter,
                            method=method,
                            constraints=kwargs.get('constraints', None),
                            callback=callback)

    # replace the previous script with the new fp parameters
    self.parameterscript.script = str(fp)

    if not self.silent:
        # log.info the results
        info_("\n")
        info_('*' * 50)
        if not dry:
            info_("  Result:")
        else:
            info_("  Starting parameters:")
        info_('*' * 50)
        info_(self.parameterscript.script)

    # store the models
    for exp_idx, dataset in enumerate(self.datasets):
        (dataset.modeldata, dataset.modelnames, dataset.model_A,
         dataset.model_a, dataset.model_b) = self._get_modeldata(dataset, exp_idx)

    # reset log_level
    if not self.silent:
        preferences.log_level = level

    return
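
# ----------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not from the source): `f` stands for a
# Fit-like object exposing the run() method above. A dry run only displays the
# starting parameters; the other calls actually optimize them.
def _example_fit_run(f):
    f.run(dry=True)                                  # inspect starting parameters
    f.run(maxiter=500, every=20, method='simplex')   # simplex optimization
    f.run(maxiter=100, method='hopping')             # or basin-hopping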
def download_nist_ir(CAS, index="all"):
    """
    Download IR spectra from the NIST webbook.

    Parameters
    ----------
    CAS : int or str
        The CAS number, given as "XXXX-XX-X" (str), "XXXXXXX" (str) or XXXXXXX (int).
    index : str, int or tuple of ints
        If set to 'all' (default), import all available spectra for the compound
        corresponding to the CAS number, else import a single spectrum or the
        selected spectra.

    Returns
    -------
    list of NDDataset or NDDataset
        The dataset(s).

    See Also
    --------
    read : Read data from experimental data.
    """
    if isinstance(CAS, str) and "-" in CAS:
        CAS = CAS.replace("-", "")

    if index == "all":
        # test urls and return list if any...
        index = []
        i = 0
        while True:
            url = (
                f"https://webbook.nist.gov/cgi/cbook.cgi?JCAMP=C{CAS}&Index={i}&Type=IR"
            )
            try:
                response = requests.get(url, timeout=10)
                if b"Spectrum not found" in response.content[:30]:
                    break
                else:
                    index.append(i)
                    i += 1
            except OSError:
                error_("OSError: could not connect to NIST")
                return None

        if len(index) == 0:
            error_("NIST IR: no spectrum found")
            return
        elif len(index) == 1:
            info_("NIST IR: 1 spectrum found")
        else:
            info_(f"NIST IR: {len(index)} spectra found")

    elif isinstance(index, int):
        index = [index]
    elif not is_iterable(index):
        raise ValueError("index must be 'all', int or iterable of int")

    out = []
    for i in index:
        # sample address (water, spectrum 1):
        # https://webbook.nist.gov/cgi/cbook.cgi?JCAMP=C7732185&Index=1&Type=IR
        url = f"https://webbook.nist.gov/cgi/cbook.cgi?JCAMP=C{CAS}&Index={i}&Type=IR"
        try:
            response = requests.get(url, stream=True, timeout=10)
            if b"Spectrum not found" in response.content[:30]:
                error_(f"NIST IR: Spectrum {i} does not exist... please check!")
                if i == index[-1] and out == []:
                    return None
                else:
                    break
        except OSError:
            error_("OSError: Cannot connect... ")
            return None

        # Load data
        txtdata = ""
        for rd in response.iter_content():
            txtdata += rd.decode("utf8")

        with open("temp.jdx", "w") as f:
            f.write(txtdata)
        try:
            ds = read_jcamp("temp.jdx")
            # replace the default entry ":imported from jdx file":
            ds.history[0] = ds.history[0][
                : len(str(datetime.now(timezone.utc)))
            ] + f" : downloaded from NIST: {url}\n"
            out.append(ds)
            (Path(".") / "temp.jdx").unlink()
        except Exception:
            raise OSError(
                "Can't read this JCAMP file: please report the issue to SpectroChemPy developers"
            )

    if len(out) == 1:
        return out[0]
    else:
        return out
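
# ----------------------------------------------------------------------------
# Usage sketch for download_nist_ir(), using water (CAS 7732-18-5), the sample
# referenced in the comment above. Whether a given index exists depends on the
# NIST webbook entry, so the explicit indexes here are illustrative.
def _example_download_nist():
    all_spectra = download_nist_ir("7732-18-5")            # every available spectrum
    single = download_nist_ir(7732185, index=1)            # spectrum 1 only
    selection = download_nist_ir("7732185", index=(0, 1))  # a selection
    return all_spectra, single, selection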