def assert_almost_equal_inf(x, y, decimal=6, msg=None):
    """Assert near-equality while treating matching infinities/NaNs as equal."""
    x = np.atleast_1d(x)
    y = np.atleast_1d(y)
    # Infinities and NaNs must occur at the same positions in both arrays.
    assert_equal(np.isposinf(x), np.isposinf(y))
    assert_equal(np.isneginf(x), np.isneginf(y))
    assert_equal(np.isnan(x), np.isnan(y))
    # Only the finite entries are compared numerically.
    assert_almost_equal(x[np.isfinite(x)], y[np.isfinite(y)], decimal=decimal)
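# Usage sketch for the helper above (assumes numpy's testing helpers are imported in
# this module, e.g. `from numpy.testing import assert_equal, assert_almost_equal`).
a = np.array([1.0, np.inf, -np.inf, np.nan, 2.0000001])
b = np.array([1.0, np.inf, -np.inf, np.nan, 2.0000002])
assert_almost_equal_inf(a, b)  # passes: non-finite entries line up, finite ones agree to 6 decimals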
def compute_frequencydomaincoef(x, data_length_sec, sampling_frequency, nfreq_bands,
                                win_length_sec, stride_sec):
    n_channels = x.shape[1]
    # Integer division so n_timesteps can be used as an array dimension and range bound.
    n_timesteps = (data_length_sec - win_length_sec) // stride_sec + 1
    n_fbins = nfreq_bands
    xfreq = np.zeros((n_timesteps, 136))
    x2 = np.zeros((n_channels, n_fbins, n_timesteps))
    for i in range(n_channels):
        xc = np.zeros((n_fbins, n_timesteps))
        for frame_num, w in enumerate(range(0, data_length_sec - win_length_sec + 1,
                                            stride_sec)):
            # Window of win_length_sec seconds starting at second w, for channel i.
            xw = x[w * sampling_frequency:(w + win_length_sec) * sampling_frequency, i]
            fft = np.log10(np.absolute(np.fft.rfft(xw)))
            # FFT sample frequencies for this window.
            fft_freq = np.fft.rfftfreq(n=xw.shape[-1], d=1.0 / sampling_frequency)
            xc[:nfreq_bands, frame_num] = group_into_bands(fft, fft_freq, nfreq_bands)
        x2[i, :, :] = xc
    for j in range(n_timesteps):
        x2[:, :, j][np.isneginf(x2[:, :, j])] = 0
        scaled = preprocessing.scale(x2[:, :, j], axis=0)
        matrix = CorrelationMatrix().apply(scaled)
        matrix[np.isneginf(matrix)] = 0
        matrix[np.isnan(matrix)] = 0
        eigenvalue = Eigenvalues().apply(matrix)
        freqdomaincor = upper_right_triangle(matrix)
        xfreq[j, :] = np.concatenate((freqdomaincor, eigenvalue))
    xfreq[np.isneginf(xfreq)] = 0
    xfreq[np.isnan(xfreq)] = 0
    return xfreq
def min_sum_diffs(filename, args):
    """Sum of the differences (in dB) between responses and a reference response.

    Args:
        filename (str): Name of output file
        args (dict): 'refresp' key with path & filename of reference response;
            'outputs' key with a list of names (IDs) of outputs (rxs) from input file

    Returns:
        diffdB (float): Sum of the differences (in dB) between responses and
            a reference response
    """

    # Load (from gprMax output file) the reference response
    f = h5py.File(args['refresp'], 'r')
    tmp = f['/rxs/rx1/']
    fieldname = list(tmp.keys())[0]
    refresp = np.array(tmp[fieldname])

    # Load (from gprMax output file) the response
    f = h5py.File(filename, 'r')
    nrx = f.attrs['nrx']

    diffdB = 0
    outputs = 0
    for rx in range(1, nrx + 1):
        output = f['/rxs/rx' + str(rx) + '/']
        if output.attrs['Name'] in args['outputs']:
            outputname = list(output.keys())[0]
            modelresp = np.array(output[outputname])
            # Calculate sum of differences, ignoring -inf entries (a zero difference
            # gives log10(0) = -inf). Use ~ for the boolean complement, not unary minus.
            tmp = 20 * np.log10(np.abs(modelresp - refresp) / np.amax(np.abs(refresp)))
            finite = ~np.isneginf(tmp)
            tmp = np.abs(np.sum(tmp[finite])) / len(tmp[finite])
            diffdB += tmp
            outputs += 1

    return diffdB / outputs
def non_matches(arr, val):
    '''
    Given an ndarray and an arbitrary value, including np.nan, np.inf, etc.,
    return an ndarray that contains only elements that are *not* equal to val.

    :param arr: n-dimensional numpy array
    :type arr: numpy.ndarray
    :param val: value, including the special values numpy.nan, numpy.inf, and -numpy.inf
    :type val: ANY.
    '''
    # Normal (finite) value? Then a simple comparison is enough:
    if np.isfinite(val):
        return arr[arr != val]

    # Special value (nan or a signed infinity). Build a True/False mask of the
    # entries that are *not* equal to val. The signed checks are tested directly:
    # np.isinf(val) is true for both +inf and -inf, so testing it first would
    # remove infinities of both signs regardless of which one was requested.
    if np.isnan(val):
        cond = np.logical_not(np.isnan(arr))
    elif np.isneginf(val):
        cond = np.logical_not(np.isneginf(arr))
    elif np.isposinf(val):
        cond = np.logical_not(np.isposinf(arr))

    # Use the True/False ndarray as a mask over arr:
    return arr[cond]
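# Usage sketch for non_matches: the NaN and signed-infinity cases each remove only the
# entries matching the requested special value.
arr = np.array([1.0, np.nan, 2.0, np.inf, -np.inf, 1.0])
non_matches(arr, 1.0)      # -> array([nan,  2., inf, -inf])
non_matches(arr, np.nan)   # -> array([ 1.,  2., inf, -inf,  1.])
non_matches(arr, -np.inf)  # -> array([ 1., nan,  2., inf,  1.])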
def imagesDiffer(imageArr1, imageArr2, skipMaskArr=None, rtol=1.0e-05, atol=1e-08):
    """Compare the pixels of two image arrays; return True if close, False otherwise

    Inputs:
    - image1: first image to compare
    - image2: second image to compare
    - skipMaskArr: pixels to ignore; nonzero values are skipped
    - rtol: relative tolerance (see below)
    - atol: absolute tolerance (see below)

    rtol and atol are positive, typically very small numbers.
    The relative difference (rtol * abs(b)) and the absolute difference "atol"
    are added together to compare against the absolute difference between "a" and "b".

    Return a string describing the error if the images differ significantly,
    an empty string otherwise
    """
    retStrs = []
    # Use "is not None": comparing an ndarray to None with != is elementwise.
    if skipMaskArr is not None:
        maskedArr1 = numpy.ma.array(imageArr1, copy=False, mask=skipMaskArr)
        maskedArr2 = numpy.ma.array(imageArr2, copy=False, mask=skipMaskArr)
        filledArr1 = maskedArr1.filled(0.0)
        filledArr2 = maskedArr2.filled(0.0)
    else:
        filledArr1 = imageArr1
        filledArr2 = imageArr2

    nan1 = numpy.isnan(filledArr1)
    nan2 = numpy.isnan(filledArr2)
    if numpy.any(nan1 != nan2):
        retStrs.append("NaNs differ")

    posinf1 = numpy.isposinf(filledArr1)
    posinf2 = numpy.isposinf(filledArr2)
    if numpy.any(posinf1 != posinf2):
        retStrs.append("+infs differ")

    neginf1 = numpy.isneginf(filledArr1)
    neginf2 = numpy.isneginf(filledArr2)
    if numpy.any(neginf1 != neginf2):
        retStrs.append("-infs differ")

    # compare values that should be comparable (are neither infinite, nan nor masked)
    valSkipMaskArr = nan1 | nan2 | posinf1 | posinf2 | neginf1 | neginf2
    if skipMaskArr is not None:
        valSkipMaskArr |= skipMaskArr
    valMaskedArr1 = numpy.ma.array(imageArr1, copy=False, mask=valSkipMaskArr)
    valMaskedArr2 = numpy.ma.array(imageArr2, copy=False, mask=valSkipMaskArr)
    valFilledArr1 = valMaskedArr1.filled(0.0)
    valFilledArr2 = valMaskedArr2.filled(0.0)

    if not numpy.allclose(valFilledArr1, valFilledArr2, rtol=rtol, atol=atol):
        errArr = numpy.abs(valFilledArr1 - valFilledArr2)
        maxErr = errArr.max()
        maxPosInd = numpy.where(errArr == maxErr)
        maxPosTuple = (maxPosInd[1][0], maxPosInd[0][0])
        errStr = "maxDiff=%s at position %s; value=%s vs. %s" % \
            (maxErr, maxPosTuple, valFilledArr1[maxPosInd][0], valFilledArr2[maxPosInd][0])
        retStrs.insert(0, errStr)
    return "; ".join(retStrs)
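# Usage sketch: identical arrays give an empty string; otherwise the largest deviation
# is reported as an (x, y) position, i.e. column first.
a = numpy.array([[1.0, numpy.nan], [3.0, 4.0]])
b = numpy.array([[1.0, numpy.nan], [3.0, 4.5]])
imagesDiffer(a, a)  # -> ''
imagesDiffer(a, b)  # -> 'maxDiff=0.5 at position (1, 1); value=4.0 vs. 4.5'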
def figS4(data_dir=mydir, figname = 'FigS4', saveAs = 'eps'): models = ['lognorm', 'mete', 'zipf'] fig = plt.figure() count = 0 gs = gridspec.GridSpec(4, 4) #gs.update(wspace=0.1, hspace=0.1) for i in range(0, 4, 2): for j in range(0, 4, 2): if count < 2: ax = plt.subplot(gs[i:i+2, j:j+2], adjustable='box-forced') count += 1 else: ax = plt.subplot(gs[i:i+2, 1:3], adjustable='box-forced') if i == 0 and j == 0: NSR2 = importData.import_NSR2_data(data_dir + \ 'data/NSR2/Stratified/lognorm_pln_NSR2_stratify.txt') ax.set_title("Lognormal", fontsize = 18) ll = np.asarray(list(((NSR2["ll"])))) ll = ll[np.isneginf(ll) == False] print 'Lognorm: mean = ' + str(np.mean(ll)) + ' std = ' + str(np.std(ll)) elif i == 0 and j == 2: NSR2 = importData.import_NSR2_data(data_dir + \ 'data/NSR2/Stratified/zipf_mle_NSR2_stratify.txt') ax.set_title("Zipf", fontsize = 18) ll = np.asarray(list(((NSR2["ll"])))) ll = ll[np.isneginf(ll) == False] print 'Zipf: mean = ' + str(np.mean(ll)) + ' std = ' + str(np.std(ll)) elif i == 2 and j == 0: NSR2 = importData.import_NSR2_data(data_dir + \ 'data/NSR2/Stratified/mete_NSR2_stratify.txt') ax.set_title("Log-series", fontsize = 18) ll = np.asarray(list(((NSR2["ll"])))) ll = ll[np.isneginf(ll) == False] print 'Log-series: mean = ' + str(np.mean(ll)) + ' std = ' + str(np.std(ll)) else: continue ax.set( adjustable='box-forced') KDE = mo.CV_KDE(ll) #ax.hist(ll, 30, fc='gray', histtype='stepfilled', alpha=0.5, normed=True) ax.plot(KDE[0], KDE[1], linewidth=3, alpha=0.8 , color = 'blue') ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('%.0E')) ax.xaxis.set_major_formatter(mticker.FormatStrFormatter('%.0E')) ax.set_xlim([min(KDE[0]), 0]) plt.xticks(fontsize = 7) plt.yticks(fontsize = 7) ax.set_xlabel('Log-likelihood', fontsize = 16) ax.set_ylabel('Probability density', fontsize = 14) plt.setp(ax.get_xticklabels()[::2], visible=False) plt.setp(ax.get_yticklabels()[::2], visible=False) fig_name = str(mydir + 'figures/' + figname + '_RGB.' + saveAs) fig.subplots_adjust(left=0.1, bottom = 0.1,hspace=0.1) fig.tight_layout()#pad=1.2, w_pad=0.8, h_pad=0.8 #fig.text(0.50, 0.017, 'Log-likelihood', ha='center', va='center', fontsize=15) #fig.text(0.04, 0.5, 'Probability', ha='center', va='center', rotation='vertical', fontsize=20) plt.savefig(fig_name, dpi=600, format = saveAs) plt.close()
def test_infinity_neg(self):
    x = -numpy.inf
    y = self.sendAndReceive(x)
    self.assertEqual(y, x)
    # assertTrue replaces the deprecated unittest alias assert_
    self.assertTrue(numpy.isneginf(x))
    self.assertTrue(numpy.isneginf(y))
    self.assertEqual(numpy.array(x).shape, y.shape)
    self.assertEqual(numpy.array(x).dtype, y.dtype)
def _transform_data(pdata, levels, data_transform): """ Return [pdata,plotlev,plotlab,extend,trans_base_list]; if data_transform == False, trans_base_list = None. Notes: ------ pdata: data used for contourf plotting. plotlev: the levels used in contourf plotting. extend: the value for parameter extand in contourf. trans_base_list: cf. mathex.plot_array_transg """ if levels == None: ftuple = (pdata, None, None, "neither") if data_transform == True: raise Warning("Strange levels is None but data_transform is True") else: if data_transform == True: # make the data transform before plotting. pdata_trans, plotlev, plotlab, trans_base_list = mathex.plot_array_transg(pdata, levels, copy=True) if np.isneginf(plotlab[0]) and np.isposinf(plotlab[-1]): ftuple = (pdata_trans, plotlev[1:-1], plotlab, "both") elif np.isneginf(plotlab[0]) or np.isposinf(plotlab[-1]): raise ValueError( """only one extreme set as infinitive, please set both as infinitive if arrow colorbar is wanted.""" ) else: ftuple = (pdata_trans, plotlev, plotlab, "neither") # data_transform==False else: plotlev = pb.iteflat(levels) plotlab = pb.iteflat(levels) if np.isneginf(plotlab[0]) and np.isposinf(plotlab[-1]): # here the levels would be like [np.NINF,1,2,3,np.PINF] # in following contourf, all values <1 and all values>3 will be # automatically plotted in the color of two arrows. # easy to see in this example: # a=np.tile(np.arange(10),10).reshape(10,10); # fig,ax=g.Create_1Axes(); # cs=ax.contourf(a,levels=np.arange(2,7),extend='both'); # plt.colorbar(cs) ftuple = (pdata, plotlev[1:-1], plotlab, "both") elif np.isneginf(plotlab[0]) or np.isposinf(plotlab[-1]): raise ValueError( """only one extreme set as infinitive, please set both as infinitive if arrow colorbar is wanted.""" ) else: ftuple = (pdata, plotlev, plotlab, "neither") datalist = list(ftuple) if data_transform == True: datalist.append(trans_base_list) else: datalist.append(None) return datalist
def _generate_colorbar_ticks_label( data_transform=False, colorbarlabel=None, trans_base_list=None, forcelabel=None, plotlev=None, plotlab=None ): """ Return (colorbar_ticks,colorbar_labels) """ # data_transform==True and levels!=None if data_transform == True: if colorbarlabel != None: colorbarlabel = pb.iteflat(colorbarlabel) transformed_colorbarlabel_ticks, x, y, trans_base_list = mathex.plot_array_transg( colorbarlabel, trans_base_list, copy=True ) # Note if/else blocks are organized in 1st tire by check if the two # ends are -inf/inf and 2nd tire by check if colorbarlabel is None if np.isneginf(plotlab[0]) and np.isposinf(plotlab[-1]): if colorbarlabel != None: ftuple = (transformed_colorbarlabel_ticks, colorbarlabel) else: ftuple = (plotlev, plotlab[1:-1]) elif np.isneginf(plotlab[0]) or np.isposinf(plotlab[-1]): raise ValueError("It's strange to set only side as infitive") else: if colorbarlabel != None: ftuple = (transformed_colorbarlabel_ticks, colorbarlabel) else: ftuple = (plotlev, plotlab) # data_transform==False else: if np.isneginf(plotlab[0]) and np.isposinf(plotlab[-1]): # if colorbarlabel is forced, then ticks and ticklabels will be forced. if colorbarlabel != None: ftuple = (colorbarlabel, colorbarlabel) # This by default will be done, it's maintained here only for clarity. else: ftuple = (plotlab[1:-1], plotlab[1:-1]) elif np.isneginf(plotlab[0]) or np.isposinf(plotlab[-1]): raise ValueError("It's strange to set only side as infitive") else: if colorbarlabel != None: ftuple = (colorbarlabel, colorbarlabel) else: ftuple = (plotlab, plotlab) ftuple = list(ftuple) if forcelabel != None: if len(forcelabel) != len(ftuple[1]): raise ValueError( """the length of the forcelabel and the length of labeled ticks is not equal!""" ) else: ftuple[1] = forcelabel return ftuple
def _diagnose(self):
    # Update log with the same per-array statistics for each dataset.
    for name, arr in (("data", self.data),
                      ("data_white", self.data_white),
                      ("data_dark", self.data_dark),
                      ("theta", self.theta)):
        self.logger.debug("diagnose: %s: shape: %s" % (name, str(arr.shape)))
        self.logger.debug("diagnose: %s: dtype: %s" % (name, str(arr.dtype)))
        self.logger.debug("diagnose: %s: size: %.2fMB" % (name, arr.nbytes * 9.53674e-7))
        self.logger.debug("diagnose: %s: nans: %s" % (name, str(np.sum(np.isnan(arr)))))
        self.logger.debug("diagnose: %s: -inf: %s" % (name, str(np.sum(np.isneginf(arr)))))
        self.logger.debug("diagnose: %s: +inf: %s" % (name, str(np.sum(np.isposinf(arr)))))
        self.logger.debug("diagnose: %s: positives: %s" % (name, str(np.sum(arr > 0))))
        self.logger.debug("diagnose: %s: negatives: %s" % (name, str(np.sum(arr < 0))))
        self.logger.debug("diagnose: %s: mean: %s" % (name, str(np.mean(arr))))
        self.logger.debug("diagnose: %s: min: %s" % (name, str(np.min(arr))))
        self.logger.debug("diagnose: %s: max: %s" % (name, str(np.max(arr))))
    self.logger.info("diagnose [ok]")
def test_neginf(self):
    arr = np.empty(100)
    arr[:] = -np.inf
    for np_func, acml_func in self.vector_funcs:
        np_out = np_func(arr)
        acml_out = acml_func(arr)
        equal_nan = np.isnan(np_out) == np.isnan(acml_out)
        equal_posinf = np.isposinf(np_out) == np.isposinf(acml_out)
        equal_neginf = np.isneginf(np_out) == np.isneginf(acml_out)
        # np.all replaces the deprecated alias np.alltrue
        self.assertTrue(np.all(equal_nan),
                        msg="NaN-test failed for %s" % acml_func)
        self.assertTrue(np.all(equal_posinf),
                        msg="posinf-test failed for %s" % acml_func)
        self.assertTrue(np.all(equal_neginf),
                        msg="neginf-test failed for %s" % acml_func)
def encode_fill_value(v, dtype):
    # early out
    if v is None:
        return v
    if dtype.kind == 'f':
        if np.isnan(v):
            return 'NaN'
        elif np.isposinf(v):
            return 'Infinity'
        elif np.isneginf(v):
            return '-Infinity'
        else:
            return float(v)
    elif dtype.kind in 'ui':
        return int(v)
    elif dtype.kind == 'b':
        return bool(v)
    elif dtype.kind in 'SV':
        v = base64.standard_b64encode(v)
        if not PY2:  # pragma: py2 no cover
            v = str(v, 'ascii')
        return v
    elif dtype.kind == 'U':
        return v
    elif dtype.kind in 'mM':
        return int(v.view('u8'))
    else:
        return v
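# Usage sketch: non-finite float fill values are encoded as the JSON-friendly strings
# 'NaN', 'Infinity' and '-Infinity'; other kinds pass through as plain Python scalars.
encode_fill_value(np.float64('-inf'), np.dtype('f8'))  # -> '-Infinity'
encode_fill_value(np.float64('nan'), np.dtype('f8'))   # -> 'NaN'
encode_fill_value(7, np.dtype('i4'))                   # -> 7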
def set_logp_to_neg_inf(X, logp, bounds):
    """Set `logp` to negative infinity when `X` is outside the allowed bounds.

    # Arguments
        X: tensorflow.Tensor
            The variable to apply the bounds to
        logp: tensorflow.Tensor
            The log probability corresponding to `X`
        bounds: list of `Region` objects
            The regions corresponding to allowed regions of `X`

    # Returns
        logp: tensorflow.Tensor
            The newly bounded log probability
    """
    conditions = []
    for l, u in bounds:
        lower_is_neg_inf = not isinstance(l, tf.Tensor) and np.isneginf(l)
        upper_is_pos_inf = not isinstance(u, tf.Tensor) and np.isposinf(u)

        if not lower_is_neg_inf and upper_is_pos_inf:
            conditions.append(tf.greater(X, l))
        elif lower_is_neg_inf and not upper_is_pos_inf:
            conditions.append(tf.less(X, u))
        elif not (lower_is_neg_inf or upper_is_pos_inf):
            conditions.append(tf.logical_and(tf.greater(X, l), tf.less(X, u)))
        # If both bounds are infinite the region is unbounded and adds no condition.

    if len(conditions) > 0:
        # X is allowed if it falls inside *any* of the bounded regions (their union).
        is_inside_bounds = conditions[0]
        for condition in conditions[1:]:
            is_inside_bounds = tf.logical_or(is_inside_bounds, condition)

        logp = tf.select(is_inside_bounds, logp,
                         tf.fill(tf.shape(X), config.dtype(-np.inf)))

    return logp
def calculate(self, g, level_number_density, lines_lower_level_index, lines_upper_level_index, metastability, lines): n_lower = level_number_density.values.take(lines_lower_level_index, axis=0, mode='raise') n_upper = level_number_density.values.take(lines_upper_level_index, axis=0, mode='raise') g_lower = self.get_g_lower(g, lines_lower_level_index) g_upper = self.get_g_upper(g, lines_upper_level_index) meta_stable_upper = self.get_metastable_upper(metastability, lines_upper_level_index) stimulated_emission_factor = ne.evaluate('1 - ((g_lower * n_upper) / ' '(g_upper * n_lower))') stimulated_emission_factor[n_lower == 0.0] = 0.0 stimulated_emission_factor[np.isneginf(stimulated_emission_factor)]\ = 0.0 stimulated_emission_factor[meta_stable_upper & (stimulated_emission_factor < 0)] = 0.0 if self.nlte_species: nlte_lines_mask = \ np.zeros(stimulated_emission_factor.shape[0]).astype(bool) for species in self.nlte_species: nlte_lines_mask |= (lines.atomic_number == species[0]) & \ (lines.ion_number == species[1]) stimulated_emission_factor[(stimulated_emission_factor < 0) & nlte_lines_mask[np.newaxis].T] = 0.0 return stimulated_emission_factor
def _update_parameters(self): """ Update parameters of the acquisition required to evaluate the function. In particular: * Sample representer points repr_points * Compute their log values repr_points_log * Compute belief locations logP """ self.repr_points, self.repr_points_log = self.sampler.get_samples(self.num_repr_points, self.proposal_function, self.burn_in_steps) if np.any(np.isnan(self.repr_points_log)) or np.any(np.isposinf(self.repr_points_log)): raise RuntimeError("Sampler generated representer points with invalid log values: {}".format(self.repr_points_log)) # Removing representer points that have 0 probability of being the minimum (corresponding to log probability being minus infinity) idx_to_remove = np.where(np.isneginf(self.repr_points_log))[0] if len(idx_to_remove) > 0: idx = list(set(range(self.num_repr_points)) - set(idx_to_remove)) self.repr_points = self.repr_points[idx, :] self.repr_points_log = self.repr_points_log[idx] # We predict with the noise as we need to make sure that var is indeed positive definite. mu, _ = self.model.predict(self.repr_points) # we need a vector mu = np.ndarray.flatten(mu) var = self.model.predict_covariance(self.repr_points) self.logP, self.dlogPdMu, self.dlogPdSigma, self.dlogPdMudMu = epmgp.joint_min(mu, var, with_derivatives=True) # add a second dimension to the array self.logP = np.reshape(self.logP, (self.logP.shape[0], 1))
def convert_to_log(self, img):
    log_img = np.ones(img.shape, np.float32)
    with np.errstate(divide='ignore'):
        log_img = np.log(img, log_img)
    # log(0) produces -inf; zero those out, and map any NaNs to 0 as well.
    log_img[np.isneginf(log_img)] = 0
    return np.nan_to_num(log_img)
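# Usage sketch (the method does not touch `self`, so it can be exercised standalone):
img = np.array([[0.0, 1.0], [np.e, 10.0]], dtype=np.float32)
convert_to_log(None, img)
# -> [[0., 0.], [1., 2.3025851]] : log(0) -> -inf is zeroed, other entries become log(x)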
def prior_probabilities(self, theta):
    """
    Calculate the prior (log) probabilities. This is based on the prior
    distributions given in prior_distributions, and assumes independence,
    so we just add the log-densities up.
    """
    p = numpy.array([numpy.log(getattr(self, self.parameters[i]).pdf(theta[i]))
                     for i in range(len(self.parameters))]).sum()
    # Guard against a zero-density point: clip -inf to the log of a tiny probability.
    if numpy.isneginf(p):
        p = numpy.log(1.0E-300)
    return p
def Draw(self, args=None):
    """Draw the various functions"""
    if not args or "SAME" not in args:
        # make a 'blank' function to occupy the complete range of x values:
        lower_lim = min([lim[0] for lim in self.functions_dict.keys()])
        if np.isneginf(lower_lim):
            lower_lim = -999
        upper_lim = max([lim[1] for lim in self.functions_dict.keys()])
        if np.isposinf(upper_lim):
            upper_lim = 999
        blank = ROOT.TF1("blank" + str(np.random.randint(0, 10000)), "1.5",
                         lower_lim, upper_lim)
        blank.Draw()
        max_value = max([func.GetMaximum(lim[0], lim[1])
                         for lim, func in self.functions_dict.items()]) * 1.1
        blank.SetMaximum(max_value)
        min_value = min([func.GetMinimum(lim[0], lim[1])
                         for lim, func in self.functions_dict.items()]) * 0.9
        blank.SetMinimum(min_value)
        # Needed so ROOT keeps the blank function alive and it actually gets drawn.
        ROOT.SetOwnership(blank, False)
        blank.SetLineColor(ROOT.kWhite)

    # now draw the rest of the functions
    args = "" if not args else args
    for func in self.functions_dict.values():
        func.Draw("SAME" + args)
def gradient_desc_ridge(X, Y, W, alpha, lambd, num_iter=1000, conv_tol=0.01,
                        check_interval=500):
    c = float("inf")
    log("Learn Rate", alpha)
    for i in range(num_iter):
        # delta = 2/N * SIGMA[(XW - Y) * x] + 2 * lambd * W
        diff = predict(X, W) - Y
        delta = np.sum(np.multiply(X, diff), axis=0)  # sum top to bottom for each attribute
        delta = delta * 2.0 / len(Y)
        delta = np.array([delta]).transpose()  # restore vector shape of (n_attr x 1)
        delta = delta + (2 * lambd * W)  # vector addition
        W = W - alpha * delta

        if i % check_interval == 0:
            predY = predict(X, W)
            newcost = MSECost(predY, Y)
            log("#%d, cost = %.8g" % (i, newcost))
            if np.isnan(newcost) or np.isinf(newcost) or np.isneginf(newcost):
                raise Exception("ERROR: number overflow, please adjust learning rate")
            diff = abs(newcost - c)
            c = newcost
            if diff < conv_tol:
                log("Converged with tolerance %f " % conv_tol)
                break
        # `quiet` is a module-level flag in the original source.
        if not quiet and i % (check_interval * 10) == 0:
            print(W.flatten())
    return W
def calculate(self, g, level_number_density, lines_lower_level_index, lines_upper_level_index, metastability, lines): n_lower = level_number_density.values.take(lines_lower_level_index, axis=0, mode='raise') n_upper = level_number_density.values.take(lines_upper_level_index, axis=0, mode='raise') g_lower = self.get_g_lower(g, lines_lower_level_index) g_upper = self.get_g_upper(g, lines_upper_level_index) meta_stable_upper = self.get_metastable_upper(metastability, lines_upper_level_index) stimulated_emission_factor = ne.evaluate('1 - ((g_lower * n_upper) / ' '(g_upper * n_lower))') stimulated_emission_factor[n_lower == 0.0] = 0.0 stimulated_emission_factor[np.isneginf(stimulated_emission_factor)]\ = 0.0 stimulated_emission_factor[meta_stable_upper & (stimulated_emission_factor < 0)] = 0.0 if self.nlte_species: nlte_lines_mask = lines.reset_index().apply( lambda row: (row.atomic_number, row.ion_number) in self.nlte_species, axis=1 ).values stimulated_emission_factor[(stimulated_emission_factor < 0) & nlte_lines_mask[np.newaxis].T] = 0.0 return stimulated_emission_factor
def takeRatio(num, den):
    toReturn = num.copy()
    toReturn['data'] = np.log(num['data'] / den['data'])
    whereBad = np.isnan(toReturn['data']) | np.isinf(toReturn['data']) | np.isneginf(toReturn['data'])
    toReturn['data'][whereBad] = 0.0
    return toReturn
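# Usage sketch, assuming num/den are dict-like with a 'data' ndarray (as the
# .copy()/['data'] access above implies). Division warnings aside, log(0) -> -inf and
# x/0 -> inf are both zeroed out.
num = {'data': np.array([1.0, 0.0, 2.0])}
den = {'data': np.array([1.0, 1.0, 0.0])}
takeRatio(num, den)['data']  # -> array([0., 0., 0.])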
def traverse_data(datum, is_numpy=is_numpy, use_numpy=True): """recursively dig until a flat list is found if numpy is available convert the flat list to a numpy array and send off to transform_array() to handle nan, inf, -inf otherwise iterate through items in array converting non-json items Args: datum (list) : a list of values or lists is_numpy: True if numpy is present (see imports) use_numpy: toggle numpy as a dependency for testing purposes """ is_numpy = is_numpy and use_numpy if is_numpy and not any(isinstance(el, (list, tuple)) for el in datum): return transform_array(np.asarray(datum)) datum_copy = [] for item in datum: if isinstance(item, (list, tuple)): datum_copy.append(traverse_data(item)) elif isinstance(item, float): if np.isnan(item): item = 'NaN' elif np.isposinf(item): item = 'Infinity' elif np.isneginf(item): item = '-Infinity' datum_copy.append(item) else: datum_copy.append(item) return datum_copy
def likelihood_function ( self, theta ): """For example! This function ought to be overridden by the user, and maybe extended with whatever extra parameters you need to get hold of your observations, or model driver parameters. This function method calculates the likelihood function for a vector M{\theta}. Usually, you have a model you run with these parameters as inputs (+ some driver data), and some observations that go with the output of the forward model output. These two sets of values are combined in some sort of cost function/likelihood function. A common criterion is to assume that the model is able to perfectly replicate the observations (given a proper parametrisation). The only mismatch between model output and observations is then given by the uncertainty with which the measurement is performed, and we can encode this as a zero-mean Normal distribution. The variance of this distribution is then related to the observational error. If different measurements are used, a multivariate normal is useful, and correlation between observations can also be included, if needs be. """ means = numpy.matrix([-3.0, 2.8]) means = numpy.matrix([-5.0, 5]) sigma1 = 1.0 sigma2 = 2#0.5 rho = -0.5#-0.1 covar = numpy.matrix([[sigma1*sigma1,rho*sigma1*sigma2],[rho*sigma1*sigma2,sigma2*sigma2]]) inv_covar = numpy.linalg.inv ( covar ) # numpy.matrix([[ 5.26315789, 9.47368421],\ #[ 9.47368421, 21.05263158]]) det_covar = numpy.linalg.det( covar ) #0.047499999999999987 N = means.shape[0] X = numpy.matrix(means- theta) #X = theta #p = full_gauss_den(X, means, covar, True) #This is just lazy... Using libraries to invert a 2x2 matrix & calc. its determinant.... #Also, the log calculations could be done more efficiently and stored, but... p = pow(1.0/(2*numpy.pi), N/2.) p = p / numpy.sqrt ( numpy.linalg.det (covar)) #p = 0.73025296137109341 # Precalc'ed #p = p * numpy.exp (-0.5*X*inv_covar*X.transpose()) a = X*inv_covar*X.T p = p*numpy.exp(-0.5*a) #pdb.set_trace() p = numpy.log(p) if numpy.isneginf(p): p = numpy.log(1.0E-300) return p
def map_to_range(v, oi, oa, ni, na):
    # Check the signed infinities explicitly: numpy.isinf(v) is also true for -inf,
    # which would make the neginf branch below unreachable.
    if numpy.isposinf(v):
        return na
    elif numpy.isneginf(v):
        return ni
    else:
        return (((v - oi) * (na - ni)) / (oa - oi)) + ni
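# Usage sketch: linearly map v from [oi, oa] onto [ni, na], with infinities pinned to
# the corresponding endpoint of the new range.
map_to_range(5.0, 0.0, 10.0, 0.0, 1.0)         # -> 0.5
map_to_range(numpy.inf, 0.0, 10.0, 0.0, 1.0)   # -> 1.0
map_to_range(-numpy.inf, 0.0, 10.0, 0.0, 1.0)  # -> 0.0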
def optimize_A(self, A):
    """Find optimal transformation matrix A by minimization.

    Parameters
    ----------
    A : ndarray
        The transformation matrix A.

    Returns
    -------
    A : ndarray
        The transformation matrix.
    """
    flat_map, square_map = get_maps(A)
    alpha = to_flat(1.0 * A, flat_map)

    obj = lambda x: -1 * self.objective_function(x, self.T, self.right_eigenvectors,
                                                 square_map, self.populations)
    self.obj = obj
    self.alpha = alpha.copy()

    logger.info("Initial value of objective function: f = %f", obj(alpha))

    alpha = scipy.optimize.anneal(obj, alpha, lower=0.0, maxiter=1,
                                  schedule="boltzmann", dwell=1000, feps=1E-3,
                                  boltzmann=2.0, T0=1.0)[0]
    alpha = scipy.optimize.fmin(obj, alpha, full_output=True, xtol=1E-4, ftol=1E-4,
                                maxfun=5000, maxiter=100000)[0]

    logger.info("Final value: f = %f" % (obj(alpha)))

    if np.isneginf(obj(alpha)):
        raise ValueError("Error: minimization has not located a feasible point.")

    A = to_square(alpha, square_map)
    return A
def filt_butter(data, samp_freq, butter_freq, axis=-1):
    '''
    Filter data with a 2nd order butterworth filter.

    Parameters
    ==========
    data: ndarray
    samp_freq: sampling period (s)
    butter_freq: [cutoff_low, cutoff_high] (Hz), can be infinite
    axis (optional): axis along which to filter, default = -1

    Returns
    =======
    filtNs: filtered version of data
    '''
    order = 2
    ny = 0.5 / samp_freq  # Nyquist frequency
    cof = butter_freq / ny  # normalized cutoff freq
    if np.isneginf(cof[0]) and np.isfinite(cof[1]):
        # lowpass
        cof1 = cof[1]
        b, a = scipy.signal.butter(order, cof1, btype='low')
        filtNs = scipy.signal.filtfilt(b, a, data, axis=axis)
    elif np.isfinite(cof[0]) and np.isinf(cof[1]):
        # highpass
        cof1 = cof[0]
        b, a = scipy.signal.butter(order, cof1, btype='high')
        filtNs = scipy.signal.filtfilt(b, a, data, axis=axis)
    elif np.isfinite(cof[0]) and np.isfinite(cof[1]):
        # bandpass
        b, a = scipy.signal.butter(order, cof, btype='band')
        filtNs = scipy.signal.filtfilt(b, a, data, axis=axis)
    else:
        raise Exception('filt_butter called with bad cutoff frequency')
    filtNs /= samp_freq  # normalize to rate
    return filtNs
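# Usage sketch: band-limit a 1 kHz-sampled trace to 1-50 Hz. Note that samp_freq is
# the sampling *period* in seconds (per the docstring), and butter_freq must support
# element indexing, e.g. an ndarray.
t = np.arange(0, 1, 0.001)
sig = np.sin(2 * np.pi * 10 * t) + 0.1 * np.random.randn(t.size)
band = filt_butter(sig, 0.001, np.array([1.0, 50.0]))
low = filt_butter(sig, 0.001, np.array([-np.inf, 50.0]))  # lowpass: lower cutoff = -inf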
def merciless_print(i, node, fn):
    """Debugging theano. Prints inputs and outputs at every point.
    In case NaN, Inf or -Inf is detected, fires up the pdb debugger."""
    print('')
    print('-------------------------------------------------------')
    print('Node %s' % str(i))
    theano.printing.debugprint(node)
    print('Inputs : %s' % [input for input in fn.inputs])
    print('Outputs: %s' % [output for output in fn.outputs])
    print('Node:')
    for output in fn.outputs:
        try:
            if numpy.isnan(output[0]).any():
                print('*** NaN detected ***')
                theano.printing.debugprint(node)
                print('Inputs : %s' % [input[0] for input in fn.inputs])
                print('Outputs: %s' % [output[0] for output in fn.outputs])
                pdb.set_trace()
                raise ValueError('Found NaN in computation!')
            if numpy.isposinf(output[0]).any() or numpy.isneginf(output[0]).any():
                print('*** Inf detected ***')
                theano.printing.debugprint(node)
                print('Inputs : %s' % [input[0] for input in fn.inputs])
                print('Outputs: %s' % [output[0] for output in fn.outputs])
                pdb.set_trace()
                raise ValueError('Found Inf in computation!')
        except TypeError:
            logging.debug('Couldn\'t check node for NaN/Inf: {0}'.format(node))
def traverse_data(obj, is_numpy=is_numpy, use_numpy=True):
    """ Recursively traverse an object until a flat list is found.

    If NumPy is available, the flat list is converted to a numpy array
    and passed to transform_array() to handle ``nan``, ``inf``, and ``-inf``.

    Otherwise, iterate through all items, converting non-JSON items.

    Args:
        obj (list) : a list of values or lists
        is_numpy (bool, optional): whether NumPy is available
            (default: True if NumPy is importable)
        use_numpy (bool, optional): toggle NumPy as a dependency for testing
            This argument is only useful for testing (default: True)
    """
    is_numpy = is_numpy and use_numpy
    if is_numpy and all(isinstance(el, np.ndarray) for el in obj):
        return [transform_array(el) for el in obj]
    obj_copy = []
    for item in obj:
        if isinstance(item, (list, tuple)):
            obj_copy.append(traverse_data(item))
        elif isinstance(item, float):
            if np.isnan(item):
                item = 'NaN'
            elif np.isposinf(item):
                item = 'Infinity'
            elif np.isneginf(item):
                item = '-Infinity'
            obj_copy.append(item)
        else:
            obj_copy.append(item)
    return obj_copy
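# Usage sketch (assumes the module-level `is_numpy` flag that the default argument
# refers to): nested lists are traversed recursively, and bare float nan/inf values
# become the JSON-safe strings used above.
traverse_data([1.0, float('nan'), float('inf'), [2.0, float('-inf')]], use_numpy=False)
# -> [1.0, 'NaN', 'Infinity', [2.0, '-Infinity']]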
def msr2k(rvnames, rvs, trunclb, truncub, G): # robustnes klb = trunclb[0]; kub=truncub[0]; # reliability corr = np.eye(len(rvnames)) probdata = ProbData(names=rvnames, rvs=rvs, corr=corr, nataf=False) analysisopt = AnalysisOpt(gradflag='DDM', recordu=False, recordx=False, flagsens=False, verbose=False) # limit state 1 def gf1(x, param=None): m, C, Sre, Na = x K = C*(Sre**m)*(G**m)*(np.pi**(m/2.))*Na return K-kub def dgdq1(x, param=None): m, C, Sre, Na = x Srem = Sre**m; Gm = G**m; pim2 = np.pi**(m/2.) dgdm = C*np.log(Sre)*Srem*Gm*pim2*Na+C*Srem*np.log(G)*Gm*pim2*Na+\ C*Srem*Gm*np.log(np.pi)*pim2*0.5*Na dgdC = Srem*Gm*pim2*Na dgdSre = C*m*(Sre**(m-1.))*Gm*pim2*Na dgdNa = C*Srem*Gm*pim2 return [dgdm, dgdC, dgdSre, dgdNa] gfunc1 = Gfunc(gf1, dgdq1) formBeta1 = CompReliab(probdata, gfunc1, analysisopt) # limit state 2 def gf2(x, param=None): m, C, Sre, Na = x K = C*(Sre**m)*(G**m)*(np.pi**(m/2))*Na return klb-K def dgdq2(x, param=None): m, C, Sre, Na = x Srem = Sre**m; Gm = G**m; pim2 = np.pi**(m/2) dgdm = C*np.log(Sre)*Srem*Gm*pim2*Na+C*Srem*np.log(G)*Gm*pim2*Na+\ C*Srem*Gm*np.log(np.pi)*pim2*0.5*Na dgdC = Srem*Gm*pim2*Na dgdSre = C*m*(Sre**(m-1.))*Gm*pim2*Na dgdNa = C*Srem*Gm*pim2 return [-dgdm, -dgdC, -dgdSre, -dgdNa] gfunc2 = Gfunc(gf2, dgdq2) formBeta2 = CompReliab(probdata, gfunc2, analysisopt) # system reliability try: if np.isneginf(klb): formresults = formBeta1.form_result() pf = formresults.pf1 elif np.isposinf(kub): formresults = formBeta2.form_result() pf = formresults.pf1 else: sysBeta = SysReliab([formBeta1, formBeta2], [2]) sysformres = sysBeta.mvn_msr(sysBeta.syscorr) pf = sysformres.pf # formresults = formBeta2.form_result() # pf = formresults.pf1 except np.linalg.LinAlgError: pf = 0. return pf
def to_standard_form(self):
    """
    Return an instance of StandardLP by factoring this problem.
    """
    A = self.A.tocsc(copy=True)
    b = self.b.copy()
    c = self.c.copy()
    r = self.r.copy()
    l = self.l.copy()
    u = self.u.copy()
    f = self.f

    # abort if lower bound equals -Infinity
    if np.isneginf(self.l).any():
        raise ValueError('Lower bounds (l) contains -inf.')

    # shift lower bounds to zero (x <- x-l) so that new problem
    # has the following form
    #
    #     optimize c^Tx + c^Tl
    #
    #     s.t. b-Al <= Ax <= b-Al+r
    #             0 <=  x <= u-l

    # indices where u is not +inf
    ind = np.where(~np.isposinf(u))[0]
    u[ind] -= l[ind]

    b = b - A.dot(l)
    f += np.dot(c, l)

    # Convert equality constraints to a pair of inequalities
    A = vstack([A, A])  # Double A matrix
    b = np.r_[b, b]
    b[:self.m] *= -1
    b[self.m:] += r

    # add upper bounds: one row per finitely-bounded variable selecting x_ind <= u-l
    # (the original line was garbled; this reconstructs the intended selection matrix).
    nubs = len(ind)
    Aubs = coo_matrix((np.ones(nubs), (np.arange(nubs), ind)),
                      shape=(nubs, A.shape[1]))
    b = np.r_[b, u[ind]]
    A = vstack([A, Aubs])

    # Now lp has the following form,
    #
    #     maximize c^Tx + c^Tl
    #
    #     s.t. -Ax <= -b
    #           Ax <= b+r-l
    #            x <= u-l
    #            x >= 0

    assert A.shape[0] == b.shape[0]

    lp = StandardLP(A, b, c, f=f)

    return lp
def replace_neginf(array):
    temp = array
    minval = (array[np.where(np.isfinite(array))[0]]).min()
    temp[np.where(np.isneginf(temp))[0]] = minval - 1e-300
    return temp
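# Usage sketch: -inf entries are replaced with (finite minimum - 1e-300); the offset is
# far below double precision at ordinary magnitudes, so the replacement is effectively
# the finite minimum, and the input array is modified in place.
arr = np.array([-np.inf, -3.0, 1.0])
replace_neginf(arr)  # -> array([-3., -3., 1.])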
def _scobit_utility_transform(systematic_utilities, alt_IDs, rows_to_alts, shape_params, intercept_params, intercept_ref_pos=None, *args, **kwargs): """ Parameters ---------- systematic_utilities : 1D ndarray. All elements should be ints, floats, or longs. Should contain the systematic utilities of each observation per available alternative. Note that this vector is formed by the dot product of the design matrix with the vector of utility coefficients. alt_IDs : 1D ndarray. All elements should be ints. There should be one row per obervation per available alternative for the given observation. Elements denote the alternative corresponding to the given row of the design matrix. rows_to_alts : 2D scipy sparse matrix. There should be one row per observation per available alternative and one column per possible alternative. This matrix maps the rows of the design matrix to the possible alternatives for this dataset. All elements should be zeros or ones. shape_params : None or 1D ndarray. If an array, each element should be an int, float, or long. There should be one value per shape parameter of the model being used. intercept_params : None or 1D ndarray. If an array, each element should be an int, float, or long. If J is the total number of possible alternatives for the dataset being modeled, there should be J-1 elements in the array. intercept_ref_pos : int, or None, optional. Specifies the index of the alternative, in the ordered array of unique alternatives, that is not having its intercept parameter estimated (in order to ensure identifiability). Should only be None if `intercept_params` is None. Returns ------- transformations : 2D ndarray. Should have shape `(systematic_utilities.shape[0], 1)`. The returned array contains the transformed utility values for this model. All elements should be ints, floats, or longs. 
""" # Figure out what indices are to be filled in if intercept_ref_pos is not None and intercept_params is not None: needed_idxs = range(intercept_params.shape[0] + 1) needed_idxs.remove(intercept_ref_pos) if len(intercept_params.shape) > 1 and intercept_params.shape[1] > 1: # Get an array of zeros with shape # (num_possible_alternatives, num_parameter_samples) all_intercepts = np.zeros( (rows_to_alts.shape[1], intercept_params.shape[1])) # For alternatives having their intercept estimated, replace the # zeros with the current value of the estimated intercepts all_intercepts[needed_idxs, :] = intercept_params else: # Get an array of zeros with shape (num_possible_alternatives,) all_intercepts = np.zeros(rows_to_alts.shape[1]) # For alternatives having their intercept estimated, replace the # zeros with the current value of the estimated intercepts all_intercepts[needed_idxs] = intercept_params else: # Create a full set of intercept parameters including the intercept # constrained to zero all_intercepts = np.zeros(rows_to_alts.shape[1]) # Figure out what intercept values correspond to each row of the # systematic utilities long_intercepts = rows_to_alts.dot(all_intercepts) # Convert the shape parameters back into their 'natural parametrization' natural_shapes = np.exp(shape_params) natural_shapes[np.isposinf(natural_shapes)] = max_comp_value # Figure out what shape values correspond to each row of the # systematic utilities long_natural_shapes = rows_to_alts.dot(natural_shapes) # Calculate the data dependent part of the transformation # Also, along the way, guard against numeric underflow or overflow exp_neg_v = np.exp(-1 * systematic_utilities) exp_neg_v[np.isposinf(exp_neg_v)] = max_comp_value powered_term = np.power(1 + exp_neg_v, long_natural_shapes) powered_term[np.isposinf(powered_term)] = max_comp_value term_2 = np.log(powered_term - 1) # Guard against overvlow too_big_idx = np.isposinf(powered_term) term_2[too_big_idx] = (-1 * long_natural_shapes[too_big_idx] * systematic_utilities[too_big_idx]) transformations = long_intercepts - term_2 # Guard against overflow transformations[np.isposinf(transformations)] = max_comp_value transformations[np.isneginf(transformations)] = -1 * max_comp_value # Be sure to return a 2D array since other functions will be expecting that if len(transformations.shape) == 1: transformations = transformations[:, np.newaxis] return transformations
def __getitem__(self, idx): """Generate one batch of data""" # Initialization X = np.empty([ self.batch_size * 4, self.dim[0], self.dim[1], self.dim[2], self.dim[3] ]) Y = np.empty([self.batch_size * 4, self.num_out]) batch = self.list_IDs[idx * self.batch_size:(idx + 1) * self.batch_size] # Generate data c = 0 for i, ID in enumerate(batch): # Load input and output raw_vol_in = np.array( sio.loadmat(self.in_folder[ID]).get("bModes")) raw_vol_in[np.isneginf(raw_vol_in)] = -151 raw_vol_in = np.nan_to_num(raw_vol_in) tmp_vol_out = np.array( sio.loadmat(self.out_folder[ID]).get('regVars')) tmp_vol_out[tmp_vol_out < 1e-6] = 0 print(tmp_vol_out[0]) # tmp_vol_out = np.nan_to_num(tmp_vol_out) tmp_vol_in = np.empty([self.dim[0], self.dim[1], self.dim[2]]) for j in range( self.dim[2]): # Extract input image in dim(128,128,99) tmp_vol_in[:, :, j] = raw_vol_in[:, j * self.dim[0]:self.dim[ 1] + j * self.dim[1]] # selects all rows and shifts with # Call the data augmentation function Vols = AugTrain_reg(tmp_vol_in, tmp_vol_out, self.num_out, self.minmax) # X_aug = Vols[0] # Y_aug = Vols[1] X[i * c, ] = Vols[0][0] # original and augmented images in X X[i * c + 1, ] = Vols[0][1] X[i * c + 2, ] = Vols[0][2] X[i * c + 3, ] = Vols[0][3] Y[i * c, ] = Vols[1][0] Y[i * c + 1, ] = Vols[1][1] Y[i * c + 2, ] = Vols[1][2] Y[i * c + 3, ] = Vols[1][3] c = c + 4 #print(Y) print("Shape in datagentrain: " + str(type(X[1][1][1][45][0]))) X = np.moveaxis(X, -2, 1) print(X.shape) print(Y.shape) return X, Y
def testKernelResultsUsingTruncatedDistribution(self): def log_prob(x): return tf.where( x >= 0., -x - x**2, # Non-constant gradient. tf.fill(x.shape, tf.cast(-np.inf, x.dtype))) # This log_prob has the property that it is likely to attract # the flow toward, and below, zero...but for x <=0, # log_prob(x) = -inf, which should result in rejection, as well # as a non-finite log_prob. Thus, this distribution gives us an opportunity # to test out the kernel results ability to correctly capture rejections due # to finite AND non-finite reasons. # Why use a non-constant gradient? This ensures the leapfrog integrator # will not be exact. num_results = 1000 # Large step size, will give rejections due to integration error in addition # to rejection due to going into a region of log_prob = -inf. step_size = 0.2 num_leapfrog_steps = 5 num_chains = 2 # Start multiple independent chains. initial_state = tf.convert_to_tensor([0.1] * num_chains) states, kernel_results = tfp.mcmc.sample_chain( num_results=num_results, current_state=initial_state, kernel=tfp.mcmc.HamiltonianMonteCarlo( target_log_prob_fn=log_prob, step_size=step_size, num_leapfrog_steps=num_leapfrog_steps, seed=_set_seed(42)), parallel_iterations=1) states_, kernel_results_ = self.evaluate([states, kernel_results]) pstates_ = kernel_results_.proposed_state neg_inf_mask = np.isneginf( kernel_results_.proposed_results.target_log_prob) # First: Test that the mathematical properties of the above log prob # function in conjunction with HMC show up as expected in kernel_results_. # We better have log_prob = -inf some of the time. self.assertLess(0, neg_inf_mask.sum()) # We better have some rejections due to something other than -inf. self.assertLess(neg_inf_mask.sum(), (~kernel_results_.is_accepted).sum()) # We better have accepted a decent amount, even near end of the chain. self.assertLess( 0.1, kernel_results_.is_accepted[int(0.9 * num_results):].mean()) # We better not have any NaNs in states or log_prob. # We may have some NaN in grads, which involve multiplication/addition due # to gradient rules. This is the known "NaN grad issue with tf.where." self.assertAllEqual( np.zeros_like(states_), np.isnan(kernel_results_.proposed_results.target_log_prob)) self.assertAllEqual( np.zeros_like(states_), np.isnan(states_)) # We better not have any +inf in states, grads, or log_prob. self.assertAllEqual( np.zeros_like(states_), np.isposinf(kernel_results_.proposed_results.target_log_prob)) self.assertAllEqual( np.zeros_like(states_), np.isposinf( kernel_results_.proposed_results.grads_target_log_prob[0])) self.assertAllEqual(np.zeros_like(states_), np.isposinf(states_)) # Second: Test that kernel_results is congruent with itself and # acceptance/rejection of states. # Proposed state is negative iff proposed target log prob is -inf. np.testing.assert_array_less(pstates_[neg_inf_mask], 0.) np.testing.assert_array_less(0., pstates_[~neg_inf_mask]) # Acceptance probs are zero whenever proposed state is negative. acceptance_probs = np.exp(np.minimum( kernel_results_.log_accept_ratio, 0.)) self.assertAllEqual( np.zeros_like(pstates_[neg_inf_mask]), acceptance_probs[neg_inf_mask]) # The move is accepted ==> state = proposed state. self.assertAllEqual( states_[kernel_results_.is_accepted], pstates_[kernel_results_.is_accepted], ) # The move was rejected <==> state[t] == state[t - 1]. 
for t in range(1, num_results): for i in range(num_chains): if kernel_results_.is_accepted[t, i]: self.assertNotEqual(states_[t, i], states_[t - 1, i]) else: self.assertEqual(states_[t, i], states_[t - 1, i])
def _score_text(input_file, vocabulary, scorer, output_file, log_base=None, subword_marking=None, word_level=False): """Reads text from ``input_file``, computes perplexity using ``scorer``, and writes to ``output_file``. :type input_file: file object :param input_file: a file that contains the input sentences in SRILM n-best format :type vocabulary: Vocabulary :param vocabulary: vocabulary that provides mapping between words and word IDs :type scorer: TextScorer :param scorer: a text scorer for rescoring the input sentences :type output_file: file object :param output_file: a file where to write the output n-best list in SRILM format :type log_base: int :param log_base: if set to other than None, convert log probabilities to this base :type subword_marking: str :param subword_marking: if other than None, vocabulary is subwords; "word-boundary" indicates <w> token separates words, "prefix-affix" indicates subwords are prefixed/affixed with + :type word_level: bool :param word_level: if set to True, also writes word-level statistics """ scoring_iter = \ ScoringBatchIterator(input_file, vocabulary, batch_size=16, max_sequence_length=None, map_oos_to_unk=False) log_scale = 1.0 if log_base is None else numpy.log(log_base) total_logprob = 0.0 num_sentences = 0 num_tokens = 0 num_words = 0 num_probs = 0 num_unks = 0 num_zeroprobs = 0 for word_ids, words, mask in scoring_iter: class_ids, membership_probs = vocabulary.get_class_memberships( word_ids) logprobs = scorer.score_batch(word_ids, class_ids, membership_probs, mask) for seq_index, seq_logprobs in enumerate(logprobs): seq_word_ids = word_ids[:, seq_index] seq_mask = mask[:, seq_index] seq_word_ids = seq_word_ids[seq_mask == 1] seq_words = words[seq_index] merged_words, merged_logprobs = _merge_subwords( seq_words, seq_logprobs, subword_marking) # total logprob of this sequence seq_logprob = sum(lp for lp in merged_logprobs if (lp is not None) and (not numpy.isneginf(lp))) # total logprob of all sequences total_logprob += seq_logprob # number of tokens, which may be subwords, including <unk>'s num_tokens += len(seq_word_ids) # number of words, including <s>'s and <unk>'s num_words += len(merged_words) # number of word probabilities computed (may not include <unk>'s) num_seq_probs = sum((lp is not None) and (not numpy.isneginf(lp)) for lp in merged_logprobs) num_probs += num_seq_probs # number of unks and zeroprobs (just for reporting) num_unks += sum(lp is None for lp in merged_logprobs) num_zeroprobs += sum((lp is not None) and numpy.isneginf(lp) for lp in merged_logprobs) # number of sequences num_sentences += 1 if word_level: output_file.write("# Sentence {0}\n".format(num_sentences)) _write_word_scores(vocabulary, merged_words, merged_logprobs, output_file, log_scale) output_file.write("Sentence perplexity: {0}\n\n".format( numpy.exp(-seq_logprob / num_seq_probs))) output_file.write("Number of sentences: {0}\n".format(num_sentences)) output_file.write("Number of words: {0}\n".format(num_words)) output_file.write("Number of tokens: {0}\n".format(num_tokens)) output_file.write( "Number of predicted probabilities: {0}\n".format(num_probs)) output_file.write("Number of excluded (OOV) words: {0}\n".format(num_unks)) output_file.write( "Number of zero probabilities: {0}\n".format(num_zeroprobs)) if num_words > 0: cross_entropy = -total_logprob / num_probs perplexity = numpy.exp(cross_entropy) output_file.write( "Cross entropy (base e): {0}\n".format(cross_entropy)) if log_base is not None: cross_entropy /= log_scale output_file.write("Cross 
entropy (base {1}): {0}\n".format( cross_entropy, log_base)) output_file.write("Perplexity: {0}\n".format(perplexity))
def find_cdf_limits(q, f, a, b, args=(), exponent=1.1, maxiter=100, return_iterations=False): """find arguments xl, xu of cdf f such that f(xl)<=q & f(xu)>=1-q""" # f is assumed to be a monoton. incr. function from (a,b) to [0, 1] # x has various ranges # y has range [0, 1] # g maps from y to x # g_inv maps from x to y # map from [0, 1] to the actual domain of f if np.isneginf(a) and np.isposinf(b): gs = [lambda y: np.log(y / (1. - y)), lambda y: np.log((1. - y) / y)] elif np.isneginf(a): gs = [lambda y: (y - 1.) / y + b, lambda y: y / (1. - y) + b] elif np.isposinf(b): gs = [lambda y: y / (1. - y) + a, lambda y: (1. - y) / y + a] else: gs = [lambda y: y * (b - a) + a, lambda y: (1. - y) * (b - a) + a] # limit_type 0/1 is lower/upper limit for limit_type in range(2): g = gs[limit_type] for i_n, n in enumerate(range(1, maxiter)): y = 2**(-n**exponent) if i_n == 0: fval = np.array(f(g(y), *args)) limit = np.array(g(y) * np.ones_like(fval)) if limit_type == 0: bad = np.array((fval > q)) else: bad = np.array((fval < 1 - q)) else: sh = [np.array(_)[bad] for _ in args] fval[bad] = f(g(y), *sh) limit[bad] = g(y) if limit_type == 0: bad[bad] = (fval[bad] > q) else: bad[bad] = (fval[bad] < 1 - q) nbad = np.sum(bad) if nbad == 0 or y == 0: break if limit_type == 0: lower_limit = limit n_lower_limit = n if nbad > 0: warnings.warn( 'Maximum number of iterations ({}) exceeded ' 'while determining lower limit.'.format(maxiter), AccuracyWarning) else: upper_limit = limit n_upper_limit = n if nbad > 0: warnings.warn( 'Maximum number of iterations ({}) exceeded ' 'while determining upper limit.'.format(maxiter), AccuracyWarning) if return_iterations: return lower_limit, upper_limit, n_lower_limit, n_upper_limit else: return lower_limit, upper_limit
def false_map_borders_cir(self): """ Creates map of FP/FNs overlaid on CIR image with cloud borders """ plt.ioff() for img in self.img_list: img_path = data_path / 'images' / img stack_path = img_path / 'stack' / 'stack.tif' plot_path = data_path / self.batch / 'plots' / img band_combo_dir = data_path / 'band_combos' try: plot_path.mkdir(parents=True) except FileExistsError: pass with rasterio.open(str(stack_path), 'r') as ds: data = ds.read() data = data.transpose( (1, -1, 0) ) # Not sure why the rasterio.read output is originally (D, W, H) data[data == -999999] = np.nan data[np.isneginf(data)] = np.nan # Get flooded image (remove perm water) flood_index = data.shape[2] - 1 perm_index = data.shape[2] - 2 indices = np.where((data[:, :, flood_index] == 1) & (data[:, :, perm_index] == 1)) rows, cols = zip(indices) true_flood = data[:, :, flood_index] true_flood[rows, cols] = 0 # Now convert to a gray color image true_flood_rgb = np.zeros( (true_flood.shape[0], true_flood.shape[1], 4), 'uint8') true_flood_rgb[:, :, 0] = true_flood * 174 true_flood_rgb[:, :, 1] = true_flood * 236 true_flood_rgb[:, :, 2] = true_flood * 238 true_flood_rgb[:, :, 3] = true_flood * 255 # Make non-flood pixels transparent indices = np.where((true_flood_rgb[:, :, 0] == 0) & (true_flood_rgb[:, :, 1] == 0) & (true_flood_rgb[:, :, 2] == 0) & (true_flood_rgb[:, :, 3] == 0)) true_flood_rgb[indices] = 0 true_flood_rgb = Image.fromarray(true_flood_rgb, mode='RGBA') for pctl in self.pctls: # Get CIR image cir_file = band_combo_dir / '{}'.format(img + '_cir_img' + '.png') cir_img = Image.open(cir_file) # Get FP/FN image comparison_img_file = plot_path / '{}'.format('false_map' + str(pctl) + '.png') flood_overlay = Image.open(comparison_img_file) flood_overlay_arr = np.array(flood_overlay) indices = np.where((flood_overlay_arr[:, :, 0] == 0) & (flood_overlay_arr[:, :, 1] == 0) & (flood_overlay_arr[:, :, 2] == 0) & (flood_overlay_arr[:, :, 3] == 255)) flood_overlay_arr[indices] = 0 # Change red to lime green red_indices = np.where((flood_overlay_arr[:, :, 0] == 255) & (flood_overlay_arr[:, :, 1] == 0) & (flood_overlay_arr[:, :, 2] == 0) & (flood_overlay_arr[:, :, 3] == 255)) flood_overlay_arr[red_indices] = [0, 255, 64, 255] flood_overlay = Image.fromarray(flood_overlay_arr, mode='RGBA') # Create cloud border image clouds_dir = data_path / 'clouds' clouds = np.load(clouds_dir / '{0}'.format(img + '_clouds.npy')) clouds[np.isnan(data[:, :, 0])] = np.nan cloudmask = np.less(clouds, np.nanpercentile(clouds, pctl), where=~np.isnan(clouds)) from scipy.ndimage import binary_dilation, binary_erosion cloudmask_binary = cloudmask.astype('int') cloudmask_border = binary_dilation(cloudmask_binary, iterations=3) cloudmask_border = (cloudmask_border - cloudmask_binary) # Convert border to yellow border = np.zeros( (cloudmask_border.shape[0], cloudmask_border.shape[1], 4), 'uint8') border[:, :, 0] = cloudmask_border * 255 border[:, :, 1] = cloudmask_border * 255 border[:, :, 2] = cloudmask_border * 0 border[:, :, 3] = cloudmask_border * 255 # Make non-border pixels transparent indices = np.where((border[:, :, 0] == 0) & (border[:, :, 1] == 0) & (border[:, :, 2] == 0) & (border[:, :, 3] == 0)) border[indices] = 0 border_rgb = Image.fromarray(border, mode='RGBA') # Plot all layers together cir_img.paste(true_flood_rgb, (0, 0), true_flood_rgb) cir_img.paste(flood_overlay, (0, 0), flood_overlay) cir_img.paste(border_rgb, (0, 0), border_rgb) cir_img.save( plot_path / '{}'.format('false_map_border_cir' + str(pctl) + '.png'), dpi=(300, 300))
def numeric_summary(tensor): """Get a text summary of a numeric tensor. This summary is only available for numeric (int*, float*, complex*) and Boolean tensors. Args: tensor: (`numpy.ndarray`) the tensor value object to be summarized. Returns: The summary text as a `RichTextLines` object. If the type of `tensor` is not numeric or Boolean, a single-line `RichTextLines` object containing a warning message will reflect that. """ def _counts_summary(counts, skip_zeros=True, total_count=None): """Format values as a two-row table.""" if skip_zeros: counts = [(count_key, count_val) for count_key, count_val in counts if count_val] max_common_len = 0 for count_key, count_val in counts: count_val_str = str(count_val) common_len = max(len(count_key) + 1, len(count_val_str) + 1) max_common_len = max(common_len, max_common_len) key_line = debugger_cli_common.RichLine("|") val_line = debugger_cli_common.RichLine("|") for count_key, count_val in counts: count_val_str = str(count_val) key_line += _pad_string_to_length(count_key, max_common_len) val_line += _pad_string_to_length(count_val_str, max_common_len) key_line += " |" val_line += " |" if total_count is not None: total_key_str = "total" total_val_str = str(total_count) max_common_len = max(len(total_key_str) + 1, len(total_val_str)) total_key_str = _pad_string_to_length(total_key_str, max_common_len) total_val_str = _pad_string_to_length(total_val_str, max_common_len) key_line += total_key_str + " |" val_line += total_val_str + " |" return debugger_cli_common.rich_text_lines_from_rich_line_list( [key_line, val_line]) if not isinstance(tensor, np.ndarray) or not np.size(tensor): return debugger_cli_common.RichTextLines( ["No numeric summary available due to empty tensor."]) elif (np.issubdtype(tensor.dtype, np.float) or np.issubdtype(tensor.dtype, np.complex) or np.issubdtype(tensor.dtype, np.integer)): counts = [("nan", np.sum(np.isnan(tensor))), ("-inf", np.sum(np.isneginf(tensor))), ("-", np.sum( np.logical_and(tensor < 0.0, np.logical_not(np.isneginf(tensor))))), ("0", np.sum(tensor == 0.0)), ("+", np.sum( np.logical_and(tensor > 0.0, np.logical_not(np.isposinf(tensor))))), ("+inf", np.sum(np.isposinf(tensor)))] output = _counts_summary(counts, total_count=np.size(tensor)) valid_array = tensor[np.logical_not( np.logical_or(np.isinf(tensor), np.isnan(tensor)))] if np.size(valid_array): stats = [("min", np.min(valid_array)), ("max", np.max(valid_array)), ("mean", np.mean(valid_array)), ("std", np.std(valid_array))] output.extend(_counts_summary(stats, skip_zeros=False)) return output elif tensor.dtype == np.bool: counts = [ ("False", np.sum(tensor == 0)), ("True", np.sum(tensor > 0)), ] return _counts_summary(counts, total_count=np.size(tensor)) else: return debugger_cli_common.RichTextLines([ "No numeric summary available due to tensor dtype: %s." % tensor.dtype ])
def mvstdnormcdf(lower, upper, corrcoef, **kwds): '''standardized multivariate normal cumulative distribution function This is a wrapper for scipy.stats.kde.mvn.mvndst which calculates a rectangular integral over a standardized multivariate normal distribution. This function assumes standardized scale, that is the variance in each dimension is one, but correlation can be arbitrary, covariance = correlation matrix Parameters ---------- lower, upper : array_like, 1d lower and upper integration limits with length equal to the number of dimensions of the multivariate normal distribution. It can contain -np.inf or np.inf for open integration intervals corrcoef : float or array_like specifies correlation matrix in one of three ways, see notes optional keyword parameters to influence integration * maxpts : int, maximum number of function values allowed. This parameter can be used to limit the time. A sensible strategy is to start with `maxpts` = 1000*N, and then increase `maxpts` if ERROR is too large. * abseps : float absolute error tolerance. * releps : float relative error tolerance. Returns ------- cdfvalue : float value of the integral Notes ----- The correlation matrix corrcoef can be given in 3 different ways If the multivariate normal is two-dimensional than only the correlation coefficient needs to be provided. For general dimension the correlation matrix can be provided either as a one-dimensional array of the upper triangular correlation coefficients stacked by rows, or as full square correlation matrix See Also -------- mvnormcdf : cdf of multivariate normal distribution without standardization Examples -------- >>> print mvstdnormcdf([-np.inf,-np.inf], [0.0,np.inf], 0.5) 0.5 >>> corr = [[1.0, 0, 0.5],[0,1,0],[0.5,0,1]] >>> print mvstdnormcdf([-np.inf,-np.inf,-100.0], [0.0,0.0,0.0], corr, abseps=1e-6) 0.166666399198 >>> print mvstdnormcdf([-np.inf,-np.inf,-100.0],[0.0,0.0,0.0],corr, abseps=1e-8) something wrong completion with ERROR > EPS and MAXPTS function values used; increase MAXPTS to decrease ERROR; 1.048330348e-006 0.166666546218 >>> print mvstdnormcdf([-np.inf,-np.inf,-100.0],[0.0,0.0,0.0], corr, maxpts=100000, abseps=1e-8) 0.166666588293 ''' n = len(lower) #don't know if converting to array is necessary, #but it makes ndim check possible lower = np.array(lower) upper = np.array(upper) corrcoef = np.array(corrcoef) correl = np.zeros(n * (n - 1) / 2.0) #dtype necessary? 
if (lower.ndim != 1) or (upper.ndim != 1): raise ValueError, 'can handle only 1D bounds' if len(upper) != n: raise ValueError, 'bounds have different lengths' if n == 2 and corrcoef.size == 1: correl = corrcoef #print 'case scalar rho', n elif corrcoef.ndim == 1 and len(corrcoef) == n * (n - 1) / 2.0: #print 'case flat corr', corrcoeff.shape correl = corrcoef elif corrcoef.shape == (n, n): #print 'case square corr', correl.shape correl = corrcoef[np.tril_indices(n, -1)] # for ii in range(n): # for jj in range(ii): # correl[ jj + ((ii-2)*(ii-1))/2] = corrcoef[ii,jj] else: raise ValueError, 'corrcoef has incorrect dimension' if not 'maxpts' in kwds: if n > 2: kwds['maxpts'] = 10000 * n lowinf = np.isneginf(lower) uppinf = np.isposinf(upper) infin = 2.0 * np.ones(n) np.putmask(infin, lowinf, 0) # infin.putmask(0,lowinf) np.putmask(infin, uppinf, 1) #infin.putmask(1,uppinf) #this has to be last np.putmask(infin, lowinf * uppinf, -1) ## #remove infs ## np.putmask(lower,lowinf,-100)# infin.putmask(0,lowinf) ## np.putmask(upper,uppinf,100) #infin.putmask(1,uppinf) #print lower,',',upper,',',infin,',',correl #print correl.shape #print kwds.items() error, cdfvalue, inform = scipy.stats.kde.mvn.mvndst( lower, upper, infin, correl, **kwds) if inform: print 'something wrong', informcode[inform], error return cdfvalue
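The `putmask` calls above encode, per dimension, which integration limits are infinite, using the flag values the code passes to mvndst (2 = both limits finite, 0 = lower limit is -inf, 1 = upper limit is +inf, -1 = both infinite). A small sketch of the same logic with boolean indexing, assuming nothing beyond numpy:

import numpy as np

def infin_flags(lower, upper):
    lower = np.asarray(lower, dtype=float)
    upper = np.asarray(upper, dtype=float)
    lowinf = np.isneginf(lower)
    uppinf = np.isposinf(upper)
    infin = np.full(lower.shape, 2, dtype=int)
    infin[lowinf] = 0
    infin[uppinf] = 1
    infin[lowinf & uppinf] = -1  # must come last, exactly as in the code above
    return infin

print(infin_flags([-np.inf, -1.0, -np.inf], [0.0, np.inf, np.inf]))
# [ 0  1 -1]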
def RDP_depend_pate_gaussian(params, alpha): """ Return the data-dependent RDP of GNMAX (proposed in PATE2) Bounds RDP from above of GNMax given an upper bound on q (Theorem 6). Args: logq: Natural logarithm of the probability of a non-argmax outcome. sigma: Standard deviation of Gaussian noise. orders: An array_like list of Renyi orders. Returns: Upper bound on RPD for all orders. A scalar if orders is a scalar. Raises: ValueError: If the input is malformed. """ logq = params['logq'] sigma = params['sigma'] if alpha == 1: p = np.exp(logq) w = (2 * p - 1) * (logq - _log1mexp(logq)) return w if logq > 0 or sigma < 0 or np.any(alpha < 1): # not defined for alpha=1 raise ValueError("Inputs are malformed.") if np.isneginf(logq): # If the mechanism's output is fixed, it has 0-DP. print('isneginf', logq) if np.isscalar(alpha): return 0. else: return np.full_like(alpha, 0., dtype=np.float) variance = sigma**2 # Use two different higher orders: mu_hi1 and mu_hi2 computed according to # Proposition 10. mu_hi2 = math.sqrt(variance * -logq) mu_hi1 = mu_hi2 + 1 orders_vec = np.atleast_1d(alpha) ret = orders_vec / variance # baseline: data-independent bound # Filter out entries where data-dependent bound does not apply. mask = np.logical_and(mu_hi1 > orders_vec, mu_hi2 > 1) rdp_hi1 = mu_hi1 / variance rdp_hi2 = mu_hi2 / variance log_a2 = (mu_hi2 - 1) * rdp_hi2 # Make sure q is in the increasing wrt q range and A is positive. if (np.any(mask) and logq <= log_a2 - mu_hi2 * (math.log(1 + 1 / (mu_hi1 - 1)) + math.log(1 + 1 / (mu_hi2 - 1))) and -logq > rdp_hi2): # Use log1p(x) = log(1 + x) to avoid catastrophic cancellations when x ~ 0. log1q = _log1mexp(logq) # log1q = log(1-q) log_a = (alpha - 1) * (log1q - _log1mexp( (logq + rdp_hi2) * (1 - 1 / mu_hi2))) log_b = (alpha - 1) * (rdp_hi1 - logq / (mu_hi1 - 1)) # Use logaddexp(x, y) = log(e^x + e^y) to avoid overflow for large x, y. log_s1 = utils.stable_logsumexp_two(log1q + log_a, logq + log_b) log_s = np.logaddexp(log1q + log_a, logq + log_b) ret[mask] = np.minimum(ret, log_s / (alpha - 1))[mask] # print('alpha ={} mask {}'.format(alpha,ret)) if ret[mask] < 0: print('negative ret', ret) print('log_s1 ={} log_s = {}'.format(log_s1, log_s)) print('alpha = {} mu_hi1 ={}'.format(alpha, mu_hi1)) print('log1q = {} log_a = {} log_b={} log_s = {}'.format( log1q, log_a, log_b, log_s)) ret[mask] = 1. / (sigma**2) * alpha # print('replace ret with', ret) assert np.all(ret >= 0) if np.isscalar(alpha): return np.asscalar(ret) else: return ret
lambda x: x.dot(np.eye(x.shape[-1])), lambda x: da.tensordot(x, np.ones(x.shape[:2]), axes=[(0, 1), (0, 1)]), lambda x: x.sum(axis=0), lambda x: x.max(axis=0), lambda x: x.sum(axis=(1, 2)), lambda x: x.astype(np.complex128), lambda x: x.map_blocks(lambda x: x * 2), lambda x: x.map_overlap(lambda x: x * 2, depth=0, trim=True, boundary="none"), lambda x: x.map_overlap(lambda x: x * 2, depth=0, trim=False, boundary="none"), lambda x: x.round(1), lambda x: x.reshape((x.shape[0] * x.shape[1], x.shape[2])), lambda x: abs(x), lambda x: x > 0.5, lambda x: x.rechunk((4, 4, 4)), lambda x: x.rechunk((2, 2, 1)), lambda x: np.isneginf(x), lambda x: np.isposinf(x), pytest.param( lambda x: np.zeros_like(x), marks=pytest.mark.xfail( SPARSE_VERSION < parse_version("0.13.0"), reason="https://github.com/pydata/xarray/issues/5654", ), ), pytest.param( lambda x: np.ones_like(x), marks=pytest.mark.xfail( SPARSE_VERSION < parse_version("0.13.0"), reason="https://github.com/pydata/xarray/issues/5654", ), ),
def GetDatasetsProto(self, datasets, features=None): """Generates the feature stats proto from dictionaries of feature values. Args: datasets: An array of dictionaries, one per dataset, each one containing: - 'entries': The dictionary of features in the dataset from the parsed examples. - 'size': The number of examples parsed for the dataset. - 'name': The name of the dataset. features: A list of strings that is a whitelist of feature names to create feature statistics for. If set to None then all features in the dataset are analyzed. Defaults to None. Returns: The feature statistics proto for the provided datasets. """ features_seen = set() whitelist_features = set(features) if features else None all_datasets = self.datasets_proto() # TODO(jwexler): Add ability to generate weighted feature stats # if there is a specified weight feature in the dataset. # Initialize each dataset for dataset in datasets: all_datasets.datasets.add(name=dataset['name'], num_examples=dataset['size']) # This outer loop ensures that for each feature seen in any of the provided # datasets, we check the feature once against all datasets. for outer_dataset in datasets: for key, value in outer_dataset['entries'].items(): # If we have a feature whitelist and this feature is not in the # whitelist then do not process it. # If we have processed this feature already, no need to do it again. if ((whitelist_features and key not in whitelist_features) or key in features_seen): continue features_seen.add(key) # Default to type int if no type is found, so that the fact that all # values are missing from this feature can be displayed. feature_type = value[ 'type'] if 'type' in value else self.fs_proto.INT # Process the found feature for each dataset. for j, dataset in enumerate(datasets): feat = all_datasets.datasets[j].features.add( type=feature_type, name=key) value = dataset['entries'].get(key) has_data = value is not None and ( value['vals'].size != 0 if isinstance( value['vals'], np.ndarray) else value['vals']) commonstats = None # For numeric features, calculate numeric statistics. if feat.type in (self.fs_proto.INT, self.fs_proto.FLOAT): featstats = feat.num_stats commonstats = featstats.common_stats if has_data: nums = value['vals'] featstats.std_dev = np.asscalar(np.std(nums)) featstats.mean = np.asscalar(np.mean(nums)) featstats.min = np.asscalar(np.min(nums)) featstats.max = np.asscalar(np.max(nums)) featstats.median = np.asscalar(np.median(nums)) featstats.num_zeros = len(nums) - np.count_nonzero( nums) nums = np.array(nums) num_nan = len(nums[np.isnan(nums)]) num_posinf = len(nums[np.isposinf(nums)]) num_neginf = len(nums[np.isneginf(nums)]) # Remove all non-finite (including NaN) values from the numeric # values in order to calculate histogram buckets/counts. The # inf values will be added back to the first and last buckets. nums = nums[np.isfinite(nums)] counts, buckets = np.histogram(nums) hist = featstats.histograms.add() hist.type = self.histogram_proto.STANDARD hist.num_nan = num_nan for bucket_count in range(len(counts)): bucket = hist.buckets.add( low_value=buckets[bucket_count], high_value=buckets[bucket_count + 1], sample_count=np.asscalar( counts[bucket_count])) # Add any negative or positive infinities to the first and last # buckets in the histogram. 
if bucket_count == 0 and num_neginf > 0: bucket.low_value = float('-inf') bucket.sample_count += num_neginf elif bucket_count == len( counts) - 1 and num_posinf > 0: bucket.high_value = float('inf') bucket.sample_count += num_posinf if not hist.buckets: if num_neginf: hist.buckets.add(low_value=float('-inf'), high_value=float('-inf'), sample_count=num_neginf) if num_posinf: hist.buckets.add(low_value=float('inf'), high_value=float('inf'), sample_count=num_posinf) self._PopulateQuantilesHistogram( featstats.histograms.add(), nums.tolist()) elif feat.type == self.fs_proto.STRING: featstats = feat.string_stats commonstats = featstats.common_stats if has_data: strs = value['vals'] featstats.avg_length = np.mean( np.vectorize(len)(strs)) vals, counts = np.unique(strs, return_counts=True) featstats.unique = len(vals) sorted_vals = sorted(zip(counts, vals), reverse=True) for val_index, val in enumerate(sorted_vals): if val[1].dtype.type is np.str_: printable_val = val[1] else: try: printable_val = val[1].decode( 'UTF-8', 'strict') except UnicodeDecodeError: printable_val = '__BYTES_VALUE__' bucket = featstats.rank_histogram.buckets.add( low_rank=val_index, high_rank=val_index, sample_count=np.asscalar(val[0]), label=printable_val) if val_index < 2: featstats.top_values.add( value=bucket.label, frequency=bucket.sample_count) # Add the common stats regardless of the feature type. if has_data: commonstats.num_missing = value['missing'] commonstats.num_non_missing = ( all_datasets.datasets[j].num_examples - featstats.common_stats.num_missing) commonstats.min_num_values = np.asscalar( np.min(value['counts'])) commonstats.max_num_values = np.asscalar( np.max(value['counts'])) commonstats.avg_num_values = np.asscalar( np.mean(value['counts'])) if 'feat_lens' in value and value['feat_lens']: self._PopulateQuantilesHistogram( commonstats.feature_list_length_histogram, value['feat_lens']) self._PopulateQuantilesHistogram( commonstats.num_values_histogram, value['counts']) else: commonstats.num_non_missing = 0 commonstats.num_missing = all_datasets.datasets[ j].num_examples return all_datasets
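A short numpy-only sketch of the histogram handling above: non-finite values are dropped before `np.histogram`, the NaN count is kept separately (the proto stores it as `hist.num_nan`), and the +/-inf counts are folded back into the edge buckets so no observations are silently lost. Names and values are illustrative.

import numpy as np

vals = np.array([-np.inf, 1.0, 2.0, 2.0, 3.0, np.inf, np.nan])
num_nan = int(np.sum(np.isnan(vals)))
num_posinf = int(np.sum(np.isposinf(vals)))
num_neginf = int(np.sum(np.isneginf(vals)))

finite = vals[np.isfinite(vals)]
counts, buckets = np.histogram(finite)
counts = counts.astype(int)
counts[0] += num_neginf    # -inf joins the lowest bucket
counts[-1] += num_posinf   # +inf joins the highest bucket

# Every non-NaN observation is accounted for.
print(counts.sum() == finite.size + num_neginf + num_posinf)  # True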
def eval_node_probs(self): """Update probability density estimates. """ if (self.mimic_speed == False): # Create mutual info matrix mutual_info = np.zeros([self.length, self.length]) for i in range(self.length - 1): for j in range(i + 1, self.length): mutual_info[i, j] = -1 * mutual_info_score( self.keep_sample[:, i], self.keep_sample[:, j]) elif (self.mimic_speed == True): # Set ignore error to ignore dividing by zero np.seterr(divide='ignore', invalid='ignore') # get length of the sample which survived from mimic iteration len_sample_kept = self.keep_sample.shape[0] # get the length of the bit sequence / problem size len_prob = self.keep_sample.shape[1] # Expand the matrices to so each row corresponds to a row by row combination of the list of samples permuted_rows = np.repeat(self.keep_sample, self.length).reshape( len_sample_kept, len_prob * len_prob) duplicated_rows = np.hstack(([self.keep_sample] * len_prob)) # Compute the mutual information matrix in bulk # This is done by iterating through the list of possible feature values ((max_val-1)^2). # For example, a binary string would go through 00 01 10 11, for a total of 4 iterations. # First initialize the mutual info matrix. mutual_info_vectorized = np.zeros([self.length * self.length]) # Pre-compute the clusters U and V which gets computed multiple times in the inner loop. cluster_U = {} cluster_V = {} cluster_U_sum = {} cluster_V_sum = {} for i in range(0, self.max_val): cluster_U[i] = (duplicated_rows == i) cluster_V[i] = (permuted_rows == i) cluster_U_sum[i] = np.sum(duplicated_rows == i, axis=0) cluster_V_sum[i] = np.sum(permuted_rows == i, axis=0) # Compute the mutual information for all sample to sample combination # Done for each feature combination i & j ((max_val-1)^2) for i in range(0, self.max_val): for j in range(0, self.max_val): # |U_i AND V_j|/N Length of cluster matching for feature pair i j over sample length N # This is the first term in the MI computation MI_first_term = np.sum(cluster_U[i] * cluster_V[j], axis=0) MI_first_term = np.divide(MI_first_term, len_sample_kept) # compute the second term of the MI matrix # Length |U_i||V_j|, for the particular feature pair UV_length = (cluster_U_sum[i] * cluster_V_sum[j]) MI_second_term = np.log(MI_first_term) - np.log( UV_length) + np.log(len_sample_kept) # remove the nans and negative infinity, there shouldn't be any MI_second_term[np.isnan(MI_second_term)] = 0 MI_second_term[np.isneginf(MI_second_term)] = 0 # Combine the first and second term # Add the whole MI matrix for the feature to the previously computed values mutual_info_vectorized = mutual_info_vectorized + MI_first_term * MI_second_term # Need to multiply by negative to get the mutual information, and reshape (Full Matrix) mutual_info_full = -mutual_info_vectorized.reshape( self.length, self.length) # Only get the upper triangle matrix above the identity row. mutual_info = np.triu(mutual_info_full, k=1) # Possible enhancements, currently we are doing double the computation required. # Pre set the matrix so the computation is only done for rows that are needed. To do for the future. 
# Find minimum spanning tree of mutual info matrix mst = minimum_spanning_tree(csr_matrix(mutual_info)) # Convert minimum spanning tree to depth first tree with node 0 as root dft = depth_first_tree(csr_matrix(mst.toarray()), 0, directed=False) dft = np.round(dft.toarray(), 10) # Determine parent of each node parent = np.argmin(dft[:, 1:], axis=0) # Get probs probs = np.zeros([self.length, self.max_val, self.max_val]) probs[0, :] = np.histogram(self.keep_sample[:, 0], np.arange(self.max_val + 1), density=True)[0] for i in range(1, self.length): for j in range(self.max_val): subset = self.keep_sample[np.where( self.keep_sample[:, parent[i - 1]] == j)[0]] if not len(subset): probs[i, j] = 1 / self.max_val else: probs[i, j] = np.histogram(subset[:, i], np.arange(self.max_val + 1), density=True)[0] # Update probs and parent self.node_probs = probs self.parent_nodes = parent
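Both branches above estimate pairwise mutual information (negated so that the minimum spanning tree favors strongly dependent feature pairs). A hedged numpy sketch of the textbook formula, with the same NaN/-inf zeroing used above for empty cells, checked against the mutual_info_score call from the non-vectorized branch:

import numpy as np
from sklearn.metrics import mutual_info_score

def mutual_information(u, v, max_val=2):
    n = len(u)
    total = 0.0
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in range(max_val):
            for j in range(max_val):
                p_ij = np.sum((u == i) & (v == j)) / n
                p_i = np.sum(u == i) / n
                p_j = np.sum(v == j) / n
                term = p_ij * (np.log(p_ij) - np.log(p_i) - np.log(p_j))
                # Empty cells give nan (0 * -inf) or -inf; zero them out, as above.
                if np.isnan(term) or np.isneginf(term):
                    term = 0.0
                total += term
    return total

u = np.array([0, 0, 1, 1, 1, 0])
v = np.array([0, 1, 1, 1, 0, 0])
print(np.isclose(mutual_information(u, v), mutual_info_score(u, v)))  # True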
def mvstdnormcdf(lower, upper, corrcoef,maxpts = None, **kwds): '''standardized multivariate normal cumulative distribution function This is a wrapper for scipy.stats.kde.mvn.mvndst which calculates a rectangular integral over a standardized multivariate normal distribution. This function assumes standardized scale, that is the variance in each dimension is one, but correlation can be arbitrary, covariance = correlation matrix Parameters ---------- lower, upper : array_like, 1d lower and upper integration limits with length equal to the number of dimensions of the multivariate normal distribution. It can contain -np.inf or np.inf for open integration intervals corrcoef : float or array_like specifies correlation matrix in one of three ways, see notes optional keyword parameters to influence integration * maxpts : int, maximum number of function values allowed. This parameter can be used to limit the time. A sensible strategy is to start with `maxpts` = 1000*N, and then increase `maxpts` if ERROR is too large. * abseps : float absolute error tolerance. * releps : float relative error tolerance. Returns ------- cdfvalue : float value of the integral Notes ----- The correlation matrix corrcoef can be given in 3 different ways If the multivariate normal is two-dimensional than only the correlation coefficient needs to be provided. For general dimension the correlation matrix can be provided either as a one-dimensional array of the upper triangular correlation coefficients stacked by rows, or as full square correlation matrix See Also -------- mvnormcdf : cdf of multivariate normal distribution without standardization Examples -------- >>> print mvstdnormcdf([-np.inf,-np.inf], [0.0,np.inf], 0.5) 0.5 >>> corr = [[1.0, 0, 0.5],[0,1,0],[0.5,0,1]] >>> assert Matrix(0.166666399198) == mvstdnormcdf( ... [-np.inf,-np.inf,-100.0], ... [0.0,0.0,0.0], ... corr, abseps=2e-6 ... ) >>> >>> assert Matrix(0.166666588293) == mvstdnormcdf( ... [-np.inf,-np.inf,-100.0], ... [ 0.0, 0.0, 0.0], ... corr, abseps=1e-8) #doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... MvnDstError: completion with ERROR > EPS and MAXPTS function values used; increase MAXPTS to decrease ERROR, ERROR = 1.8253048422e-07 >>> assert Matrix(0.166666588293) == mvstdnormcdf( ... [-np.inf,-np.inf,-100.0], ... [0.0,0.0,0.0], ... corr,maxpts=1000000, abseps=1e-8 ... ) ''' n = len(lower) #don't know if converting to array is necessary, #but it makes ndim check possible lower = np.array(lower) upper = np.array(upper) corrcoef = np.array(corrcoef) correl = np.zeros(n*(n-1)/2.0) #dtype necessary? if (lower.ndim != 1) or (upper.ndim != 1): raise ValueError, 'can handle only 1D bounds' if len(upper) != n: raise ValueError, 'bounds have different lengths' if n==2 and corrcoef.size==1: correl = corrcoef #print 'case scalar rho', n elif corrcoef.ndim == 1 and len(corrcoef) == n*(n-1)/2.0: #print 'case flat corr', corrcoeff.shape correl = corrcoef elif corrcoef.shape == (n,n): correl = corrcoef[np.tri(n,n,-1,dtype=bool)] else: raise ValueError, 'corrcoef has incorrect dimension' if maxpts is None: maxpts = 10000*n lowinf = np.isneginf(lower) uppinf = np.isposinf(upper) infin = 2.0*np.ones(n) infin[lowinf] = 0 infin[uppinf] = 1 infin[lowinf & uppinf] = -1 error, cdfvalue, inform = mvndst(lower,upper,infin,correl,maxpts,**kwds) if inform: raise MvnDstError(inform, error) return cdfvalue
def makeDailyChannelOffsetSignal( ): from functions.TAfunctions import SMA, MoveMax, jumpTheChannelTest import functions.allstats from functions.UpdateSymbols_inHDF5 import * from functions.GetParams import GetParams file4path = os.path.join( os.getcwd(), "pyTAAAweb_DailyChannelOffsetSignal_status.params" ) figure4path = os.path.join( os.getcwd(), "pyTAAA_web", "PyTAAA_DailyChannelOffsetSignalV.png" ) symbol_directory = os.path.join( os.getcwd(), "symbols" ) symbol_file = "Naz100_Symbols.txt" symbols_file = os.path.join( symbol_directory, symbol_file ) adjClose, symbols, datearray, _, _ = loadQuotes_fromHDF( symbols_file ) ### ### get last date already processed ### _dates = [] avgPctChannel = [] numAboveBelowChannel = [] try: with open( file4path, "r" ) as f: # get number of lines in file lines = f.read().split("\n") numlines = len (lines) for i in range(numlines): statusline = lines[i] statusline_list = statusline.split(" ") statusline_list = filter(None, statusline_list) if len( statusline_list ) == 3: _dates.append( datetime.datetime.strptime( statusline_list[0], '%Y-%m-%d') ) avgPctChannel.append( float(statusline_list[1].split('%')[0])/100. ) numAboveBelowChannel.append( float(statusline_list[2]) ) except: print " Error: unable to read updates from pyTAAAweb_numberUptrendingStocks_status.params" print "" #print "_dates = ", _dates last_date = _dates[-1].date() print " ...inside makeDailyChannelOffsetSignal... last_date = ", last_date # parameters for signal params = GetParams() minperiod = params['minperiod'] maxperiod = params['maxperiod'] incperiod = params['incperiod'] numdaysinfit = params['numdaysinfit'] offset = params['offset'] print "minperiod,maxperiod,incperiod,numdaysinfit,offset = ", minperiod,maxperiod,incperiod,numdaysinfit,offset # process for each date print "\n ... inside makeDailyChannelOffsetSignal ..." dailyChannelOffsetSignal = np.zeros( adjClose.shape[1], 'float' ) dailyCountDowntrendChannelOffsetSignal = np.zeros( adjClose.shape[1], 'float' ) #for idate in range(numdaysinfit+incperiod,adjClose.shape[1]) for idate in range(adjClose.shape[1]): if datearray[idate] >= last_date : #if datearray[idate] > datetime.date(1992,1,1) : #if datearray[idate] > datetime.date(1992,1,1) : if idate%10 == 0: print " ...idate, datearray[idate] = ", idate, datearray[idate] # process all symbols numberDowntrendSymbols = 0 dailyChannelPct = [] ##print " ... symbols = ", symbols floatChannelGainsLosses = [] floatStdevsAboveChannel = [] for i, symbol in enumerate(symbols): #print " ... symbol = ", symbol quotes = adjClose[i,idate-numdaysinfit-offset-1:idate].copy() channelGainLoss, numStdDevs, pctChannel = \ recentTrendAndStdDevs( \ quotes, \ datearray,\ minperiod=minperiod,\ maxperiod=maxperiod,\ incperiod=incperiod,\ numdaysinfit=numdaysinfit,\ offset=offset) floatChannelGainsLosses.append(channelGainLoss) floatStdevsAboveChannel.append(numStdDevs) floatChannelGainsLosses = np.array(floatChannelGainsLosses) floatChannelGainsLosses[np.isinf(floatChannelGainsLosses)] = -999. floatChannelGainsLosses[np.isneginf(floatChannelGainsLosses)] = -999. floatChannelGainsLosses[np.isnan(floatChannelGainsLosses)] = -999. floatChannelGainsLosses = floatChannelGainsLosses[floatChannelGainsLosses != -999.] floatStdevsAboveChannel = np.array(floatStdevsAboveChannel) floatStdevsAboveChannel[np.isinf(floatStdevsAboveChannel)] = -999. floatStdevsAboveChannel[np.isneginf(floatStdevsAboveChannel)] = -999. floatStdevsAboveChannel[np.isnan(floatStdevsAboveChannel)] = -999. 
floatStdevsAboveChannel = floatStdevsAboveChannel[floatStdevsAboveChannel != -999.] ##print "floatChannelGainsLosses.shape = ", floatChannelGainsLosses.shape trimmeanGains = np.mean(floatChannelGainsLosses[np.logical_and(\ floatChannelGainsLosses>np.percentile(floatChannelGainsLosses,5),\ floatChannelGainsLosses<np.percentile(floatChannelGainsLosses,95)\ )]) trimmeanStdevsAboveChannel = np.mean(floatStdevsAboveChannel[np.logical_and(\ floatStdevsAboveChannel>np.percentile(floatStdevsAboveChannel,5),\ floatStdevsAboveChannel<np.percentile(floatStdevsAboveChannel,95)\ )]) #print "idate= ",idate,str(datearray[idate]) textmessage2 = '' with open( file4path, "a" ) as ff: textmessage2 = "\n"+str(datearray[idate])+" "+\ format(trimmeanGains,"8.2%")+" "+\ format(trimmeanStdevsAboveChannel,"7.1f") ff.write(textmessage2) print "textmessage2 = ", textmessage2 #print "idate= ",idate, str(datearray[idate]) ########################################## # make plot ########################################## ### ### make a combined plot ### 1. get percent of uptrending stocks ### _dates = [] avgPctChannel = [] numAboveBelowChannel = [] try: with open( file4path, "r" ) as f: # get number of lines in file lines = f.read().split("\n") numlines = len (lines) for i in range(numlines): statusline = lines[i] statusline_list = statusline.split(" ") statusline_list = filter(None, statusline_list) if len( statusline_list ) == 3: _dates.append( datetime.datetime.strptime( statusline_list[0], '%Y-%m-%d') ) avgPctChannel.append( float(statusline_list[1].split('%')[0])/100. ) numAboveBelowChannel.append( float(statusline_list[2]) ) except: print " Error: unable to read updates from pyTAAAweb_numberUptrendingStocks_status.params" print "" _dates = np.array(_dates) avgPctChannel = np.array(avgPctChannel) numAboveBelowChannel = np.array(numAboveBelowChannel) print " avgPctChannel min, mean, max = ", avgPctChannel.min(),avgPctChannel.mean(),avgPctChannel.max() print "\n\n numAboveBelowChannel = ", numAboveBelowChannel print " numAboveBelowChannel min, mean, max = ", numAboveBelowChannel.min(),numAboveBelowChannel.mean(),numAboveBelowChannel.max() plt.figure(4,figsize=(9,7)) plt.clf() plt.grid(True) numDaysToPlot = 252*3 plt.plot( _dates[-numDaysToPlot:], np.clip(avgPctChannel[-numDaysToPlot:]*100.,-200.,200.), 'r-', lw=.1) plt.plot( _dates[-numDaysToPlot:], numAboveBelowChannel[-numDaysToPlot:], 'b-', lw=.25) plt.title("pyTAAA History Plot\nChannel Offset Signal") plt.savefig(figure4path) figure4path = 'PyTAAA_DailyChannelOffsetSignalV2.png' # re-set to name without full path figure4_htmlText = "\n<br><h3>Channel Offset Signal</h3>\n" figure4_htmlText = figure4_htmlText + "\nPlot shows up/down trending in last few days compared to trend for stocks in Nasdaq 100.\n" figure4_htmlText = figure4_htmlText + '''<br><img src="'''+figure4path+'''" alt="PyTAAA by DonaldPG" width="850" height="500"><br>\n''' ### ### make a combined plot ### 2. 
make plot showing trend below B&H and trade-system Value ### file3path = os.path.join( os.getcwd(), "pyTAAAweb_backtestPortfolioValue.params" ) backtestDate = [] backtestBHvalue = [] backtestSystemvalue = [] try: with open( file3path, "r" ) as f: # get number of lines in file lines = f.read().split("\n") numlines = len (lines) for i in range(numlines): try: statusline = lines[i] statusline_list = statusline.split(" ") if len( statusline_list ) == 5: backtestDate.append( datetime.datetime.strptime( statusline_list[0], '%Y-%m-%d') ) backtestBHvalue.append( float(statusline_list[2]) ) backtestSystemvalue.append( float(statusline_list[4]) ) except: break except: print " Error: unable to read updates from pyTAAAweb_backtestPortfolioValue.params" print "" figure5path = os.path.join( os.getcwd(), "pyTAAA_web", "PyTAAA_backtestWithOffsetChannelSignal.png" ) plt.figure(5,figsize=(9,7)) plt.clf() subplotsize = gridspec.GridSpec(2,1,height_ratios=[5,3]) plt.subplot(subplotsize[0]) plt.grid(True) plt.yscale('log') plotmax = 1.e10 plt.ylim([1000,max(10000,plotmax)]) numDaysToPlot = 252*10 numDaysToPlot = len( backtestBHvalue ) plt.plot( backtestDate[-numDaysToPlot:], backtestBHvalue[-numDaysToPlot:], 'r-', lw=1.25, label='Buy & Hold') plt.plot( backtestDate[-numDaysToPlot:], backtestSystemvalue[-numDaysToPlot:], 'k-', lw=1.25, label='Trading System') plt.legend(loc=2,prop={'size':9}) plt.title("pyTAAA History Plot\n Portfolio Value") plt.text( backtestDate[-numDaysToPlot+50], 2500, "Backtest updated "+datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p"), fontsize=7.5 ) plt.subplot(subplotsize[1]) plt.grid(True) plt.ylim(-100, 100) plt.plot( _dates[-numDaysToPlot:], np.clip(avgPctChannel[-numDaysToPlot:]*100.,-200.,200.), 'r-', lw=.1, label='avg Pct offset channel') plt.plot( _dates[-numDaysToPlot:], numAboveBelowChannel[-numDaysToPlot:], 'b-', lw=.25, label='number above/below offset channel') plt.legend(loc=3,prop={'size':6}) plt.savefig(figure5path) figure5path = 'PyTAAA_backtestWithOffsetChannelSignal.png' # re-set to name without full path figure5_htmlText = "\n<br><h3>Daily backtest with offset Channel trend signal</h3>\n" figure5_htmlText = figure5_htmlText + "\nCombined backtest with offset Channel trend signal.\n" figure5_htmlText = figure5_htmlText + '''<br><img src="'''+figure5path+'''" alt="PyTAAA by DonaldPG" width="850" height="500"><br>\n''' return figure4_htmlText, figure5_htmlText
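The daily statistics above are 5th-95th percentile trimmed means, taken after inf/NaN entries are mapped to a -999 sentinel and dropped. A compact sketch of that reduction (the function name and values are illustrative):

import numpy as np

def trimmed_mean(values, lower_pct=5, upper_pct=95):
    values = np.asarray(values, dtype=float)
    values[np.isinf(values)] = -999.
    values[np.isneginf(values)] = -999.  # already covered by isinf; kept to mirror the code above
    values[np.isnan(values)] = -999.
    values = values[values != -999.]
    lo, hi = np.percentile(values, [lower_pct, upper_pct])
    return np.mean(values[np.logical_and(values > lo, values < hi)])

print(trimmed_mean([1.0, 2.0, 3.0, 4.0, 100.0, np.inf, np.nan]))  # 3.0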
def _output_vectors_text(input_file, vocabulary, scorer, output_file, log_base=None): """Reads text from ``input_file``, computes perplexity using ``scorer``, and writes to ``output_file``. :type input_file: file object :param input_file: a file that contains the input sentences in SRILM n-best format :type vocabulary: Vocabulary :param vocabulary: vocabulary that provides mapping between words and word IDs :type scorer: TextScorer :param scorer: a text scorer for rescoring the input sentences :type output_file: file object :param output_file: a file where to write the output n-best list in SRILM format :type log_base: int :param log_base: if set to other than None, convert log probabilities to this base """ scoring_iter = \ ScoringBatchIterator(input_file, vocabulary, batch_size=16, max_sequence_length=None, map_oos_to_unk=False) log_scale = 1.0 if log_base is None else numpy.log(log_base) total_logprob = 0.0 num_sentences = 0 num_tokens = 0 num_words = 0 num_probs = 0 num_unks = 0 num_zeroprobs = 0 all_word_ids = numpy.arange(vocabulary.num_words()) all_class_ids, membership_probs = vocabulary.get_class_memberships( all_word_ids) for word_ids, words, mask in scoring_iter: class_ids, _ = vocabulary.get_class_memberships(word_ids) membership_probs_output_vec = numpy.tile( membership_probs, (word_ids.shape[0], word_ids.shape[1], 1)) logprobs = scorer.score_batch_output(word_ids, class_ids, all_class_ids, membership_probs_output_vec, mask) for seq_index, seq_logprobs in enumerate(logprobs): seq_word_ids = word_ids[:, seq_index] seq_mask = mask[:, seq_index] seq_word_ids = seq_word_ids[seq_mask == 1] seq_words = words[seq_index] #TODO: Rename the variables properly to remove the hack below merged_words, merged_logprobs = seq_words, seq_logprobs # total logprob of this sequence seq_logprob = sum( lp[seq_word_ids[idx + 1]] for idx, lp in enumerate(merged_logprobs) if (lp[seq_word_ids[idx + 1]] is not None) and ( not numpy.isneginf(lp[seq_word_ids[idx + 1]]))) # total logprob of all sequences total_logprob += seq_logprob # number of tokens, which may be subwords, including <unk>'s num_tokens += len(seq_word_ids) # number of words, including <s>'s and <unk>'s num_words += len(merged_words) # number of word probabilities computed (may not include <unk>'s) num_seq_probs = sum((lp[seq_word_ids[idx + 1]] is not None) and ( not numpy.isneginf(lp[seq_word_ids[idx + 1]])) for idx, lp in enumerate(merged_logprobs)) num_probs += num_seq_probs # number of unks and zeroprobs (just for reporting) num_unks += sum(lp[seq_word_ids[idx + 1]] is None for idx, lp in enumerate(merged_logprobs)) num_zeroprobs += sum((lp[seq_word_ids[idx + 1]] is not None) and numpy.isneginf(lp[seq_word_ids[idx + 1]]) for idx, lp in enumerate(merged_logprobs)) # number of sequences num_sentences += 1 output_file.write("# Sentence {0}\n".format(num_sentences)) _write_output_vectors(vocabulary, merged_words, merged_logprobs, output_file, log_scale) output_file.write("Sentence perplexity: {0}\n\n".format( numpy.exp(-seq_logprob / num_seq_probs))) output_file.write("Number of sentences: {0}\n".format(num_sentences)) output_file.write("Number of words: {0}\n".format(num_words)) output_file.write("Number of tokens: {0}\n".format(num_tokens)) output_file.write( "Number of predicted probabilities: {0}\n".format(num_probs)) output_file.write("Number of excluded (OOV) words: {0}\n".format(num_unks)) output_file.write( "Number of zero probabilities: {0}\n".format(num_zeroprobs)) if num_words > 0: cross_entropy = -total_logprob / num_probs 
perplexity = numpy.exp(cross_entropy) output_file.write( "Cross entropy (base e): {0}\n".format(cross_entropy)) if log_base is not None: cross_entropy /= log_scale output_file.write("Cross entropy (base {1}): {0}\n".format( cross_entropy, log_base)) output_file.write("Perplexity: {0}\n".format(perplexity))
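A minimal sketch of the perplexity bookkeeping above: cross entropy is the mean negative log-probability over the scored tokens, perplexity is its exponential, and converting to another log base only rescales the cross entropy. The sample numbers are made up.

import numpy as np

logprobs = np.array([-1.2, -0.7, -2.3, -0.1])      # natural-log token probabilities
cross_entropy = -np.sum(logprobs) / len(logprobs)  # == -total_logprob / num_probs
perplexity = np.exp(cross_entropy)

log_base = 10
cross_entropy_base10 = cross_entropy / np.log(log_base)

print(round(cross_entropy, 4), round(perplexity, 4), round(cross_entropy_base10, 4))
# 1.075 2.93 0.4669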
def check_all_log_values_are_valid(feature_vector: List[float],
                                   value_vector: List[float]) -> bool:
    return not np.any(
        np.isneginf(np.concatenate((feature_vector, value_vector))))
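A usage sketch of the guard above: any -inf in either vector marks the pair as invalid.

import numpy as np

print(check_all_log_values_are_valid([0.0, -1.5], [-2.0, -0.3]))           # True
print(check_all_log_values_are_valid([0.0, float('-inf')], [-2.0, -0.3]))  # False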
def island_abm(rho=0.01, alpha=1.5, phi=0.4, pi=0.4, eps=0.1, lambda_param=1, T=100, N=50, _RNG_SEED=0): """ Islands growth model Parameters ---------- rho : alpha : phi : float, required eps : lambda_param: (Default = 1) T : int, required The number of periods for the simulation N : int, optional (Default = 50) Number of firms _RNG_SEED : int, optional (Default = 0) Random number seen Output ------ GDP : array, length = [,T] Simulated GPD """ # Set random number seed np.random.seed(_RNG_SEED) T_2 = int(T / 2) GDP = np.zeros((T, 1)) # Distributions # Precompute random binomial draws xy = np.random.binomial(1, pi, (T, T)) xy[T_2, T_2] = 1 # Containers s = np.zeros((T, T)) A = np.ones((N, 6)) # Initializations A[:, 1] = T_2 A[:, 2] = T_2 m = np.zeros((T, T)) m[T_2, T_2] = N dest = np.zeros((N, 2)) """ Begin ABM Code """ for t in range(T): w = np.zeros((N, N)) signal = np.zeros((N, N)) for i in range(N): for j in range(N): if i != j: if A[j, 0] == 1: w[i, j] = np.exp(-rho * (np.abs(A[j, 1] - A[i, 1]) + \ np.abs(A[j, 2] - A[i, 2]))) if np.random.rand() < w[i, j]: signal[i, j] = s[int(A[j, 1]), int(A[j, 2])] if A[i, 0] == 1: A[i, 4] = s[int(A[i, 1]), int(A[i, 2])] * \ m[int(A[i, 1]), int(A[i, 2])] ** alpha A[i, 3] = s[int(A[i, 1]), int(A[i, 2])] if A[i, 0] == 3: A[i, 4] = 0 rnd = np.random.rand() if rnd <= 0.25: A[i, 1] += 1 else: if rnd <= 0.5: A[i, 1] -= 1 else: if rnd <= 0.75: A[i, 2] += 1 else: A[i, 2] -= 1 if xy[int(A[i, 1]), int(A[i, 2])] == 1: A[i, 0] = 1 m[int(A[i, 1]), int(A[i, 2])] += 1 if m[int(A[i, 1]), int(A[i, 2])] == 1: s[int(A[i, 1]), int(A[i, 2])] = \ (1 + int(np.random.poisson(lambda_param))) * \ (A[i, 1] + A[i, 2]) + phi * A[i, 5] + np.random.randn() if (A[i, 0] == 1) and (np.random.rand() <= eps): A[i, 0] = 3 A[i, 5] = A[i, 4] m[int(A[i, 1]), int(A[i, 2])] -= 1 if t > T / 100: if A[i, 0] == 2: A[i, 4] = 0 if dest[i, 0] != A[i, 1]: if dest[i, 0] > A[i, 1]: A[i, 1] += 1 else: A[i, 1] -= 1 else: if dest[i, 1] != A[i, 2]: if dest[i, 1] > A[i, 2]: A[i, 2] += 1 else: A[i, 2] -= 1 if (dest[i, 0] == A[i, 1]) and (dest[i, 1] == A[i, 2]): A[i, 0] = 1 m[int(dest[i, 0]), int(dest[i, 1])] += 1 if A[i, 0] == 1: best_sig = np.max(signal[i, :]) if best_sig > s[int(A[i, 1]), int(A[i, 2])]: A[i, 0] = 2 A[i, 5] = A[i, 4] m[int(A[i, 1]), int(A[i, 2])] -= 1 index = np.where(signal[i, :] == best_sig)[0] if index.shape[0] > 1: ind = int(index[int(np.random.uniform(0, len(index)))]) else: ind = int(index) dest[i, 0] = A[ind, 1] dest[i, 1] = A[ind, 2] GDP[t, 0] = np.sum(A[:, 4]) #JH fix around the divide by zero error supressed in original code np.seterr(divide='ignore') log_GDP = np.log(GDP) np.seterr(divide='warn') log_GDP[np.isneginf(log_GDP)] = 0 return log_GDP
def fit(self, X): """Estimate model parameters with the expectation-maximization algorithm. A initialization step is performed before entering the em algorithm. If you want to avoid this step, set the keyword argument init_params to the empty string '' when creating the GMM object. Likewise, if you would like just to do an initialization, set n_iter=0. Parameters ---------- X : array_like, shape (n, n_features) List of n_features-dimensional data points. Each row corresponds to a single data point. """ ## initialization step X = np.asarray(X, dtype=np.float) if X.ndim == 1: X = X[:, np.newaxis] if X.shape[0] < self.n_components: raise ValueError( 'GMM estimation with %s components, but got only %s samples' % (self.n_components, X.shape[0])) max_log_prob = -np.infty for _ in range(self.n_init): if 'm' in self.init_params or not hasattr(self, 'means_'): self.means_ = cluster.KMeans( n_clusters=self.n_components, random_state=self.random_state).fit(X).cluster_centers_ if 'w' in self.init_params or not hasattr(self, 'weights_'): self.weights_ = np.tile(1.0 / self.n_components, self.n_components) if 'c' in self.init_params or not hasattr(self, 'covars_'): cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1]) if not cv.shape: cv.shape = (1, 1) self.covars_ = \ distribute_covar_matrix_to_match_covariance_type( cv, self.covariance_type, self.n_components) # EM algorithms log_likelihood = [] # reset self.converged_ to False self.converged_ = False for i in range(self.n_iter): # Expectation step curr_log_likelihood, responsibilities = self.score_samples(X) log_likelihood.append(curr_log_likelihood.sum()) # Check for convergence. if i > 0 and abs(log_likelihood[-1] - log_likelihood[-2]) < \ self.thresh: self.converged_ = True break # Maximization step self._do_mstep(X, responsibilities, self.params, self.min_covar) # if the results are better, keep it if self.n_iter: if log_likelihood[-1] > max_log_prob: max_log_prob = log_likelihood[-1] best_params = {'weights': self.weights_, 'means': self.means_, 'covars': self.covars_} # check the existence of an init param that was not subject to # likelihood computation issue. if np.isneginf(max_log_prob) and self.n_iter: raise RuntimeError( "EM algorithm was never able to compute a valid likelihood " + "given initial parameters. Try different init parameters " + "(or increasing n_init) or check for degenerate data.") # self.n_iter == 0 occurs when using GMM within HMM if self.n_iter: self.covars_ = best_params['covars'] self.means_ = best_params['means'] self.weights_ = best_params['weights'] return self
def get_format_func(self, elem, **options): missing_opt = self.check_options(**options) if missing_opt: raise Exception("Missing options: {}".format(missing_opt)) floatmode = options['floatmode'] precision = None if floatmode == 'unique' else options['precision'] suppress_small = options['suppress_small'] sign = options['sign'] infstr = options['infstr'] nanstr = options['nanstr'] exp_format = False pad_left, pad_right = 0, 0 # only the finite values are used to compute the number of digits finite = umath.isfinite(elem) finite_vals = elem[finite] nonfinite_vals = elem[~finite] # choose exponential mode based on the non-zero finite values: abs_non_zero = umath.absolute(finite_vals[finite_vals != 0]) if len(abs_non_zero) != 0: max_val = np.max(abs_non_zero) min_val = np.min(abs_non_zero) with np.errstate(over='ignore'): # division can overflow if max_val >= 1.e8 or (not suppress_small and (min_val < 0.0001 or max_val / min_val > 1000.)): exp_format = True # do a first pass of printing all the numbers, to determine sizes if len(finite_vals) == 0: trim, exp_size, unique = '.', -1, True elif exp_format: trim, unique = '.', True if floatmode == 'fixed': trim, unique = 'k', False strs = (format_float_scientific(x, precision=precision, unique=unique, trim=trim, sign=sign == '+') for x in finite_vals) frac_strs, _, exp_strs = zip(*(s.partition('e') for s in strs)) int_part, frac_part = zip(*(s.split('.') for s in frac_strs)) exp_size = max(len(s) for s in exp_strs) - 1 trim = 'k' precision = max(len(s) for s in frac_part) # this should be only 1 or 2. Can be calculated from sign. pad_left = max(len(s) for s in int_part) # pad_right is only needed for nan length calculation pad_right = exp_size + 2 + precision unique = False else: trim, unique = '.', True if floatmode == 'fixed': trim, unique = 'k', False strs = (format_float_positional(x, precision=precision, fractional=True, unique=unique, trim=trim, sign=sign == '+') for x in finite_vals) int_part, frac_part = zip(*(s.split('.') for s in strs)) pad_left = max(len(s) for s in int_part) pad_right = max(len(s) for s in frac_part) exp_size = -1 if floatmode in ['fixed', 'maxprec_equal']: precision = pad_right unique = False trim = 'k' else: unique = True trim = '.' # account for sign = ' ' by adding one to pad_left if sign == ' ' and not any(np.signbit(finite_vals)): pad_left += 1 # account for nan and inf in pad_left if len(nonfinite_vals) != 0: nanlen, inflen = 0, 0 if np.any(umath.isinf(nonfinite_vals)): neginf = sign != '-' or np.any(np.isneginf(nonfinite_vals)) inflen = len(infstr) + neginf if np.any(umath.isnan(elem)): nanlen = len(nanstr) offset = pad_right + 1 # +1 for decimal pt pad_left = max(nanlen - offset, inflen - offset, pad_left) def print_nonfinite(x): with errstate(invalid='ignore'): if umath.isnan(x): ret = ('+' if sign == '+' else '') + nanstr else: # isinf infsgn = '-' if x < 0 else '+' if sign == '+' else '' ret = infsgn + infstr return ' ' * (pad_left + pad_right + 1 - len(ret)) + ret if exp_format: def print_finite(x): return format_float_scientific(x, precision=precision, unique=unique, trim=trim, sign=sign == '+', pad_left=pad_left, exp_digits=exp_size) else: def print_finite(x): return format_float_positional(x, precision=precision, unique=unique, fractional=True, trim=trim, sign=sign == '+', pad_left=pad_left, pad_right=pad_right) def fmt(x): if umath.isfinite(x): return print_finite(x) else: return print_nonfinite(x) return fmt
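The formatter above builds its scientific and positional modes on numpy's format_float_scientific and format_float_positional helpers; a quick, hedged illustration of the two calls (exact strings can vary slightly by numpy version):

import numpy as np

# Scientific notation with a fixed number of fractional digits.
print(np.format_float_scientific(1234.5678, precision=3))  # e.g. 1.235e+03
# Positional notation with a fixed number of digits after the decimal point.
print(np.format_float_positional(1234.5678, precision=2))  # e.g. 1234.57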
def _fit(self, X, w=None, y=None, do_prediction=False): """Estimate model parameters with the EM algorithm. A initialization step is performed before entering the expectation-maximization (EM) algorithm. If you want to avoid this step, set the keyword argument init_params to the empty string '' when creating the GMM object. Likewise, if you would like just to do an initialization, set n_iter=0. Parameters ---------- X : array_like, shape (n, n_features) List of n_features-dimensional data points. Each row corresponds to a single data point. w : array-like, shape = [n_samples] (optional) Sample weights Returns ------- responsibilities : array, shape (n_samples, n_components) Posterior probabilities of each mixture component for each observation. """ # initialization step X = check_array(X, dtype=np.float64, ensure_min_samples=2, estimator=self) if X.shape[0] < self.n_components: raise ValueError( 'GMM estimation with %s components, but got only %s samples' % (self.n_components, X.shape[0])) max_log_prob = -np.infty if self.verbose > 0: print('Expectation-maximization algorithm started.') for init in range(self.n_init): if self.verbose > 0: print('Initialization ' + str(init + 1)) start_init_time = time() if 'm' in self.init_params or not hasattr(self, 'means_'): self.means_ = cluster.KMeans( n_clusters=self.n_components, random_state=self.random_state).fit(X).cluster_centers_ if self.verbose > 1: print('\tMeans have been initialized.') if 'w' in self.init_params or not hasattr(self, 'weights_'): self.weights_ = np.tile(1.0 / self.n_components, self.n_components) if self.verbose > 1: print('\tWeights have been initialized.') if 'c' in self.init_params or not hasattr(self, 'covars_'): cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1]) if not cv.shape: cv.shape = (1, 1) self.covars_ = \ distribute_covar_matrix_to_match_covariance_type( cv, self.covariance_type, self.n_components) if self.verbose > 1: print('\tCovariance matrices have been initialized.') # EM algorithms current_log_likelihood = None # reset self.converged_ to False self.converged_ = False # this line should be removed when 'thresh' is removed in v0.18 tol = (self.tol if self.thresh is None else self.thresh / float(X.shape[0])) for i in range(self.n_iter): if self.verbose > 0: print('\tEM iteration ' + str(i + 1)) start_iter_time = time() prev_log_likelihood = current_log_likelihood # Expectation step log_likelihoods, responsibilities = self.score_samples(X, w) current_log_likelihood = log_likelihoods.mean() # Check for convergence. 
# (should compare to self.tol when deprecated 'thresh' is # removed in v0.18) if prev_log_likelihood is not None: change = abs(current_log_likelihood - prev_log_likelihood) if self.verbose > 1: print('\t\tChange: ' + str(change)) if change < tol: self.converged_ = True if self.verbose > 0: print('\t\tEM algorithm converged.') break # Maximization step self._do_mstep(X, w, responsibilities, self.params, self.min_covar) if self.verbose > 1: print('\t\tEM iteration ' + str(i + 1) + ' took {0:.5f}s'.format(time() - start_iter_time)) # if the results are better, keep it if self.n_iter: if current_log_likelihood > max_log_prob: max_log_prob = current_log_likelihood best_params = { 'weights': self.weights_, 'means': self.means_, 'covars': self.covars_ } if self.verbose > 1: print('\tBetter parameters were found.') if self.verbose > 1: print('\tInitialization ' + str(init + 1) + ' took {0:.5f}s'.format(time() - start_init_time)) # check the existence of an init param that was not subject to # likelihood computation issue. if np.isneginf(max_log_prob) and self.n_iter: raise RuntimeError( "EM algorithm was never able to compute a valid likelihood " + "given initial parameters. Try different init parameters " + "(or increasing n_init) or check for degenerate data.") if self.n_iter: self.covars_ = best_params['covars'] self.means_ = best_params['means'] self.weights_ = best_params['weights'] else: # self.n_iter == 0 occurs when using GMM within HMM # Need to make sure that there are responsibilities to output # Output zeros because it was just a quick initialization responsibilities = np.zeros((X.shape[0], self.n_components)) return responsibilities
def estimate_deltas(
    G,
    intervened_node: str,
    n_timesteps: int,
    start_year: int,
    start_month: int,
    country: Optional[str] = "South Sudan",
    state: Optional[str] = None,
):
    """ Utility function that estimates the Rate of Change (deltas) for the
    intervened node per timestep. This will use the units that the CAG was
    parameterized with.

    WARNING: The state and country should be the same as what was passed to
    G.parameterize() or else you could get mismatched data.

    Deltas are estimated by the percent change between each time step, i.e.,
    (next - current)/current. Heuristics are in place to handle NAN and INF
    values. If changed from 0 to 0 (NAN case), then delta = 0. If increasing
    from 0 (+INF case), then delta = positive absolute mean of all finite
    deltas. If decreasing from 0 (-INF case), then delta = negative absolute
    mean of all finite deltas.

    See function get_true_values to see how the data is aggregated to fill in
    values for missing time points while calculating the deltas.

    Args:
        G: A completely parameterized and quantified CAG with indicators,
        estimated transition matrix, and indicator values.

        intervened_node: The full name of the node being intervened on.

        n_timesteps: Number of time steps.

        start_year: The starting year (e.g., 2012).

        start_month: The starting month (1-12).

    Returns:
        1D numpy array of deltas.
    """
    intervener_indicator = list(
        G.nodes(data=True)[intervened_node]["indicators"].keys())[0]

    query_base = " ".join([
        f"select * from indicator",
        f"where `Variable` like '{intervener_indicator}'",
    ])

    query_parts = {"base": query_base}

    if country is not None:
        check_q = query_parts["base"] + f"and `Country` is '{country}'"
        check_r = list(engine.execute(check_q))
        if check_r == []:
            warnings.warn(
                f"Selected Country not found for {intervener_indicator}! "
                f"Using default settings (South Sudan)")
            query_parts["country"] = f"and `Country` is 'South Sudan'"
        else:
            query_parts["country"] = f"and `Country` is '{country}'"

    if state is not None:
        check_q = query_parts["base"] + f"and `State` is '{state}'"
        check_r = list(engine.execute(check_q))
        if check_r == []:
            warnings.warn(
                f"Selected State not found for {intervener_indicator}! "
                f"Using default settings (Aggregation over all States)")
            query_parts["state"] = ""
        else:
            query_parts["state"] = f"and `State` is '{state}'"

    unit = list(
        G.nodes(data=True)[intervened_node]["indicators"].values())[0].unit

    int_vals = np.zeros(n_timesteps + 1)
    int_vals[0] = list(
        G.nodes(data=True)[intervened_node]["indicators"].values())[0].mean
    year = start_year
    month = start_month
    for j in range(1, n_timesteps + 1):
        query_parts["year"] = f"and `Year` is '{year}'"
        query_parts["month"] = f"and `Month` is '{month}'"
        query = " ".join(query_parts.values())
        results = list(engine.execute(query))

        if results != []:
            int_vals[j] = np.mean(
                [float(r["Value"]) for r in results if r["Unit"] == unit])
            if month == 12:
                year = year + 1
                month = 1
            else:
                month = month + 1
            continue

        query_parts["month"] = ""
        query = " ".join(query_parts.values())
        results = list(engine.execute(query))

        if results != []:
            int_vals[j] = np.mean(
                [float(r["Value"]) for r in results if r["Unit"] == unit])
            if month == 12:
                year = year + 1
                month = 1
            else:
                month = month + 1
            continue

        query_parts["year"] = ""
        query = " ".join(query_parts.values())
        results = list(engine.execute(query))

        if results != []:
            int_vals[j] = np.mean(
                [float(r["Value"]) for r in results if r["Unit"] == unit])
            if month == 12:
                year = year + 1
                month = 1
            else:
                month = month + 1
            continue

    per_ch = np.roll(int_vals, -1) - int_vals
    per_ch = per_ch / int_vals

    per_mean = np.abs(np.mean(per_ch[np.isfinite(per_ch)]))

    per_ch[np.isnan(per_ch)] = 0
    per_ch[np.isposinf(per_ch)] = per_mean
    per_ch[np.isneginf(per_ch)] = -per_mean

    return np.delete(per_ch, -1)
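A compact sketch of the delta heuristic in the final block above: percent change per step, with the NaN case (0 to 0) set to 0 and the +/-inf cases (changes from 0) replaced by the positive/negative absolute mean of the finite deltas. The input values are made up.

import numpy as np

int_vals = np.array([10.0, 12.0, 0.0, 0.0, 3.0])
with np.errstate(divide='ignore', invalid='ignore'):
    per_ch = (np.roll(int_vals, -1) - int_vals) / int_vals

per_mean = np.abs(np.mean(per_ch[np.isfinite(per_ch)]))
per_ch[np.isnan(per_ch)] = 0
per_ch[np.isposinf(per_ch)] = per_mean
per_ch[np.isneginf(per_ch)] = -per_mean

deltas = np.delete(per_ch, -1)  # drop the wrapped-around last entry
print(deltas)                   # [0.2, -1.0, 0.0, ~0.511]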
def create_discretised_variables(network, data, node_names, bin_count=4, infinite_extremes=True, decimal_places=4, mode='EqualFrequencies', zero_crossing=True, defined_bins: List[Tuple[float, float]] = None): node_names = [str(name) for name in node_names] if defined_bins is None: options = bayesServerDiscovery().DiscretizationOptions() options.setInfiniteExtremes(infinite_extremes) options.setSuggestedBinCount(bin_count) # reads data from either a Pandas dataframe or dask, so will support out of memory and in-memory. data_reader_cmd = bayesianpy.data.DaskDataset(data[node_names]).create_data_reader_command().create() if mode == 'EqualFrequencies': ef = bayesServerDiscovery().EqualFrequencies() elif mode == 'EqualIntervals': ef = bayesServerDiscovery().EqualIntervals() else: raise ValueError("mode not recognised") columns = jp.java.util.Arrays.asList( [bayesServerDiscovery().DiscretizationColumn(name) for name in node_names]) column_intervals = ef.discretize(data_reader_cmd, columns, bayesServerDiscovery().DiscretizationAlgoOptions()) for i, interval in enumerate(column_intervals): intervals = list(interval.getIntervals().toArray()) if zero_crossing: end_point_value = 0.5 zero = bayesServer().Interval(jp.java.lang.Double(jp.java.lang.Double.NEGATIVE_INFINITY), jp.java.lang.Double(end_point_value), bayesServer().IntervalEndPoint.CLOSED, bayesServer().IntervalEndPoint.OPEN) if 0.5 < intervals[0].getMaximum().floatValue(): # if the interval starts and ends at end_point_value then remove it if intervals[0].getMaximum() == end_point_value: intervals.pop(0) else: intervals[0].setMinimum(jp.java.lang.Double(0.5)) intervals[0].setMinimumEndPoint(bayesServer().IntervalEndPoint.CLOSED) intervals = [zero] + intervals v = bayesServer().Variable(node_names[i], bayesServer().VariableValueType.DISCRETE) v.setStateValueType(bayesServer().StateValueType.DOUBLE_INTERVAL) n = bayesServer().Node(v) for interval in intervals: v.getStates().add( bayesServer().State("{}".format(Builder._create_interval_name(interval, decimal_places)), interval)) network.getNodes().add(n) yield n else: for node in node_names: intervals = [] for bin in defined_bins: minEndPoint = bayesServer().IntervalEndPoint.CLOSED maxEndPoint = bayesServer().IntervalEndPoint.OPEN if np.isneginf(float(bin[0])): a = jp.java.lang.Double(jp.java.lang.Double.NEGATIVE_INFINITY) else: a = jp.java.lang.Double(bin[0]) if np.isposinf(float(bin[1])): b = jp.java.lang.Double(jp.java.lang.Double.POSITIVE_INFINITY) else: b = jp.java.lang.Double(bin[1]) intervals.append( bayesServer().Interval(a, b, minEndPoint, maxEndPoint)) v = bayesServer().Variable(node, bayesServer().VariableValueType.DISCRETE) v.setStateValueType(bayesServer().StateValueType.DOUBLE_INTERVAL) n = bayesServer().Node(v) for interval in intervals: v.getStates().add( bayesServer().State("{}".format(Builder._create_interval_name(interval, decimal_places)), interval)) network.getNodes().add(n) yield n
def calculate_log_score(self, pssm):
    pssm = np.log2(pssm) * 2
    np.place(pssm, np.isneginf(pssm), -20)
    return pssm
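Example of the clamping above, outside the class: a zero probability gives -inf after log2, so it is replaced with -20 via np.place. The matrix values are illustrative.

import numpy as np

pssm = np.array([[0.5, 0.25],
                 [0.0, 1.0]])
with np.errstate(divide='ignore'):
    scores = np.log2(pssm) * 2
np.place(scores, np.isneginf(scores), -20)
print(scores)
# [[ -2.  -4.]
#  [-20.   0.]]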
def display_rotated_image_and_wfc3_image(combined_image1, flist, wfc3_image, target_font_size, ff = -8.1, log = True, save_filename = 'rotated_img', cmap1 = 'jet', clim1 = None, clim2 = None, save = False, ax1_title = None, ax2_title = None): ''' ######################################################################################################################## #This function displays the rotated multi-slit image next to the WFC3 image #Inputs: # combined_image1: the multi-slit image array # wfc3_image: the file name of the WFC3 image # ff: fudge factor used for additional rotation in rotate_image; default = -8.1 # log: display the log of the image; default = True # save_filename: save images to this filename; default = 'rotated_img' # cmap1: Color map to use; default = None - use default matplotlib colorbar # clim1: lower contrast limit; default = None - use default matplotlib clim # clim2: upper contrast limit; default = None - use default matplotlib clim # save: switch to enable user to save the file (to save_filename); default = False #Output: # if the save keyword is set then the images will be displayed #Calls to: # rotate_image #Called from: # create_image ######################################################################################################################## ''' wfc3_img = pyfits.getdata(wfc3_image, 0) rot_img = rotate_image(combined_image1, 64.0072, 166.002207094, fudge_factor = ff) fig = pylab.figure(figsize = [30, 20]) ax1 = fig.add_subplot(1,2,1) ax2 = fig.add_subplot(1,2,2) #new_colormap = make_custom_colormap() if not cmap1: cmap1 = 'jet' #pdb.set_trace() new_colormap = getattr(matplotlib.cm, cmap1) new_colormap1 = getattr(matplotlib.cm, 'jet') norm1 = colors.Normalize(vmin = np.min(np.log10(wfc3_img)[np.isfinite(np.log10(wfc3_img))]) + 0.01, vmax = np.max(np.log10(wfc3_img)[np.isfinite(np.log10(wfc3_img))])) new_colormap.set_under('white') #pdb.set_trace() ax2.imshow(np.log10(wfc3_img), interpolation = 'nearest', cmap = new_colormap1, norm = norm1) if log: rot_img_log = np.log10(rot_img) nan_indx = np.isnan(rot_img_log) inf_indx = np.isinf(rot_img_log) neg_inf_indx = np.isneginf(rot_img_log) rot_img_log[nan_indx] = 0 rot_img_log[inf_indx] = 0 rot_img_log[neg_inf_indx] = 0 rot_img_log[nan_indx] = np.min(rot_img_log) - 1 rot_img_log[inf_indx] = np.min(rot_img_log) - 1 rot_img_log[neg_inf_indx] = np.min(rot_img_log) - 1 #norm2 = colors.Normalize(vmin = np.min(rot_img_log[np.isfinite(rot_img_log)]) + 0.01, vmax = np.max(rot_img_log[np.isfinite(rot_img_log)])) norm2 = colors.Normalize(vmin = 0, vmax = np.max(rot_img_log[np.isfinite(rot_img_log)])) cax = ax1.imshow(rot_img_log, interpolation = 'nearest', cmap = new_colormap, norm = norm2) else: norm2 = colors.Normalize(vmin = np.min(rot_img) + 0.01, vmax = np.max(rot_img)) cax = ax1.imshow(rot_img, interpolation = 'nearest') ax2.set_xlim(1300, 2000) ax2.set_ylim(1500, 2100) ax1.set_xlim(-100, 1100) ax1.set_ylim(-20, 490) fig.colorbar(cax) #if cmap1: # cax.set_cmap(cmap1) #pdb.set_trace() if clim1: cax.set_clim(clim1, clim2) ax1 = mark_boundaries(flist, slit_size, combined_image1, rot_img, ax1, 64.0072, 166.002207094, fudge_factor = ff, target_font_size = target_font_size) if ax1_title: ax1.set_title(ax1_title) if ax2_title: ax2.set_title(ax2_title) pdb.set_trace() if save: pylab.savefig(save_filename+'.pdf')
def __log(self, x):
    log = np.log(x)
    log[np.isneginf(log)] = -1e6
    return log
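The same guard as a standalone sketch: log(0) yields -inf, which is replaced by a large negative constant rather than propagated.

import numpy as np

x = np.array([1.0, 0.0, np.e])
with np.errstate(divide='ignore'):
    log = np.log(x)
log[np.isneginf(log)] = -1e6  # -inf from log(0) becomes -1e6
print(log)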
def fit(self, X): """ Run the EM algorithm to specified convergence. Parameters ---------- X : array_like, shape (n,) + d List of data points assumed that the dimensions are such that `np.prod(X.shape[1:])==n_features` """ random_state = check_random_state(self.random_state) X = np.asarray(X, dtype=self.binary_type) if X.ndim == 1: X = X[:, np.newaxis] data_shape = X.shape[1:] # flatten data to just be binary vectors data_length = np.prod(data_shape) if len(data_shape) > 1: X = X.reshape(X.shape[0], data_length) if X.shape[0] < self.n_components: raise ValueError( 'BernoulliMM estimation with %s components, but got only %s samples' % (self.n_components, X.shape[0])) inv_X = 1 - X max_log_prob = -np.infty # if debug_plot: # plw = ag.plot.PlottingWindow(subplots=(1, self.num_mix), figsize=(self.num_mix*3, 3)) for cur_init in range(self.n_init): if self.verbose: print("Current parameter initialization: {0}".format(cur_init)) if 'm' in self.init_params or not hasattr(self, 'means_'): if self.verbose: print("Initializing means") indices = np.arange(X.shape[0]) random_state.shuffle(indices) self.means_ = np.array( tuple( np.clip(X[indices[i::self.n_components]].mean(0), self.min_prob, 1 - self.min_prob) for i in range(self.n_components))) self.log_odds_, self.log_inv_mean_sums_ = _compute_log_odds_inv_means_sums( self.means_) if 'w' in self.init_params or not hasattr(self, 'weights_'): if self.verbose: print("Initializing weights") self.weights_ = np.tile(1.0 / self.n_components, self.n_components) log_likelihood = [] self.iterations = 0 self.converged_ = False for i in range(self.n_iter): # Expectation Step curr_log_likelihood, responsibilities = self.eval(X) log_likelihood.append(curr_log_likelihood.sum()) if self.verbose: print("Iteration {0}: loglikelihood {1}".format( i, log_likelihood[-1])) # check for convergence if i > 0 and abs(log_likelihood[-1] - log_likelihood[-2])/abs(log_likelihood[-2]) < \ self.thresh: self.converged_ = True break # ag.info("Iteration {0}: loglikelihood {1}".format(self.iterations, loglikelihood)) # maximization step self._do_mstep(X, responsibilities, self.params, self.min_prob) if self.n_iter: if log_likelihood[-1] > max_log_prob: if self.verbose: print("updated best params for {0}".format( self.score(X).sum())) max_log_prob = log_likelihood[-1] best_params = { 'weights': self.weights_, 'means': self.means_ } # check the existence of an init param that was not subject to # likelihood computation issue. if np.isneginf(max_log_prob) and self.n_iter: raise RuntimeError( "EM algorithm was never able to compute a valid likelihood " + "given initial parameters. Try different init parameters " + "(or increasing n_init) or check for degenerate data.") if len(data_shape) > 1: X = X.reshape(*((X.shape[0], ) + data_shape)) if self.n_iter: self.means_ = best_params['means'] self.log_odds_, self.log_inv_mean_sums_ = _compute_log_odds_inv_means_sums( self.means_) self.weights_ = best_params['weights'] return self
def unbounded_bivariate_normal_integral(rho, xl, yl): """Computes the unbounded bivariate normal integral. Computes the probability that ``X>=xl and Y>=yl`` where X and Y are jointly Gaussian random variables, with mean ``[0., 0.]`` and covariance matrix ``[[1., rho], [rho, 1.]]``. Note: to compute the probability that ``X < xl and Y < yl``, use ``unbounded_bivariate_normal_integral(rho, -xl, -yl)``. Inputs: :rho: Correlation coefficient of the bivariate normal random variable :xl, yl: Lower bounds of the integral Ported from a Matlab implementation by Alan Genz which, in turn, is based on the method described by Drezner, Z and G.O. Wesolowsky, (1989), On the computation of the bivariate normal inegral, Journal of Statist. Comput. Simul. 35, pp. 101-107, Copyright statement of Alan Genz's version: *************** Copyright (C) 2013, Alan Genz, All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - The contributor name(s) may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.""" rho = max(-1., min(1., rho)) if np.isposinf(xl) or np.isposinf(yl): return 0. elif np.isneginf(xl): return 1. if np.isneginf(yl) else _cdf(-yl) elif np.isneginf(yl): return _cdf(-xl) elif rho == 0: return _cdf(-xl)*_cdf(-yl) tp = 2.*np.pi h, k = xl, yl hk = h*k bvn = 0. 
if np.abs(rho) < 0.3: # Gauss Legendre points and weights, n = 6 w = np.array([0.1713244923791705, 0.3607615730481384, 0.4679139345726904]) x = np.array([0.9324695142031522, 0.6612093864662647, 0.2386191860831970]) elif np.abs(rho) < 0.75: # Gauss Legendre points and weights, n = 12 w = np.array([0.04717533638651177, 0.1069393259953183, 0.1600783285433464, 0.2031674267230659, 0.2334925365383547, 0.2491470458134029]) x = np.array([0.9815606342467191, 0.9041172563704750, 0.7699026741943050, 0.5873179542866171, 0.3678314989981802, 0.1252334085114692]) else: # Gauss Legendre points and weights, n = 20 w = np.array([.01761400713915212, .04060142980038694, .06267204833410906, .08327674157670475, 0.1019301198172404, 0.1181945319615184, 0.1316886384491766, 0.1420961093183821, 0.1491729864726037, 0.1527533871307259]) x = np.array([0.9931285991850949, 0.9639719272779138, 0.9122344282513259, 0.8391169718222188, 0.7463319064601508, 0.6360536807265150, 0.5108670019508271, 0.3737060887154196, 0.2277858511416451, 0.07652652113349733]) w = np.tile(w, 2) x = np.concatenate([1.-x, 1.+x]) if np.abs(rho) < 0.925: hs = .5 * (h*h + k*k) asr = .5*np.arcsin(rho) sn = np.sin(asr*x) bvn = np.dot(w, np.exp((sn*hk-hs)/(1.-sn**2))) bvn = bvn*asr/tp + _cdf(-h)*_cdf(-k) else: if rho < 0.: k = -k hk = -hk if np.abs(rho) < 1.: ass = 1.-rho**2 a = np.sqrt(ass) bs = (h-k)**2 asr = -.5*(bs/ass + hk) c = (4.-hk)/8. d = (12.-hk)/80. if asr > -100.: bvn = a*np.exp(asr)*(1.-c*(bs-ass)*(1.-d*bs)/3. + c*d*ass**2) if hk > -100.: b = np.sqrt(bs) sp = np.sqrt(tp)*_cdf(-b/a) bvn = bvn - np.exp(-.5*hk)*sp*b*(1. - c*bs*(1.-d*bs)/3.) a = .5*a xs = (a*x)**2 asr = -.5*(bs/xs + hk) inds = [i for i, asr_elt in enumerate(asr) if asr_elt>-100.] xs = xs[inds] sp = 1. + c*xs*(1.+5.*d*xs) rs = np.sqrt(1.-xs) ep = np.exp(-.5*hk*xs / (1.+rs)**2)/rs bvn = (a*np.dot(np.exp(asr[inds])*(sp-ep), w[inds]) - bvn)/tp if rho > 0: bvn += _cdf(-max(h, k)) elif h >= k: bvn = -bvn else: if h < 0.: L = _cdf(k)-_cdf(h) else: L = _cdf(-h)-_cdf(-k) bvn = L - bvn return max(0., min(1., bvn))
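A hedged sanity check (it assumes scipy.stats.multivariate_normal.cdf is available in the installed scipy): for a zero-mean bivariate normal, P(X >= xl, Y >= yl) equals P(X <= -xl, Y <= -yl), so the routine above can be compared against scipy's CDF.

import numpy as np
from scipy.stats import multivariate_normal

rho, xl, yl = 0.4, -0.3, 0.7
ours = unbounded_bivariate_normal_integral(rho, xl, yl)
ref = multivariate_normal(mean=[0.0, 0.0],
                          cov=[[1.0, rho], [rho, 1.0]]).cdf([-xl, -yl])
print(np.isclose(ours, ref, atol=1e-4))  # expected: True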
df.set_index('date', inplace=True) print(' Data found') # Save raw data outfile_raw = station_name.replace( ' ', '_') + '_raw_flow_data_' + sd.replace('-', '') + '_' + ed.replace( '-', '') + '.csv' df.to_csv(os.path.join(outfolder_raw, outfile_raw)) print(' Raw data saved to ' + outfile_raw) # Calculate variables (logflow1, logflow2, logflow3) df_vars = pd.DataFrame(index=df.index) for i in range(0, 3): # logflow1 - logflow3 df_vars['logflow' + str(i + 1)] = round( np.log10(df['flow'].shift(i + 1, freq='D').astype(float)), 5) df_vars['logflow' + str(i + 1)][np.isneginf( df_vars['logflow' + str(i + 1)])] = round(np.log10(0.005), 5) # Save file to directory outfile = station_name.replace(' ', '_') + '_Flow_Variables_' + sd.replace( '-', '') + '_' + ed.replace('-', '') + '.csv' df_vars.to_csv(os.path.join(outfolder, outfile)) print(' Flow variables calculated and saved to ' + outfile) # Summary of data missing = len(pd.DatetimeIndex(freq='D', start=sd, end=ed)) - len(df_vars) sum_dict = { 'ID': station_no, 'Start Date': str(df_vars.index[0].date()), 'End Date': str(df_vars.index[-1].date()), 'Missing Days': missing