def fit(self, chips, specs=None, n_samples=0, counts=None, a=0, bounds=None):
    """Primary execution point. Run either standard KDE or
    class-membership-based KDE. If any of the class-membership-based
    KDE arguments are set, it is run instead of standard KDE.

    Parameters
    ----------
    chips : list
        A list of chip model objects.
    specs : qikify.models.Specs, optional
        If using partitioned sampling, boundaries defining the pass /
        critical / fail subspaces must be provided.
    n_samples : int
        The number of samples to generate.
    counts : dict, optional
        If using partitioned sampling, a counts dictionary must be
        provided, with three keys: nGood, nCritical, nFail.
    a : float, optional
        Bandwidth adjustment factor, passed to _set_bandwith_factors().
    bounds : array-like, optional
        Explicit per-column bounds used for normalization; defaults to
        the min / max of the data.
    """
    X = pandas.DataFrame([chip.LCT for chip in chips])
    self.n, self.d = X.shape
    self.specs = specs
    self.columns = getattr(X, 'columns', None)

    # Normalize data/bounds
    self.scale_factors, self.Xn = standardize(X)
    self.bounds = standardize(np.array([X.min(0), X.max(0)]),
                              self.scale_factors)
    if bounds is not None:
        self.bounds = standardize(bounds, self.scale_factors)

    # Select bandwidth for the Epanechnikov kernel (rule of thumb; see
    # Silverman, p. 86). Note the float division in gamma(self.d / 2.0):
    # integer division here would silently miscompute c_d for odd d.
    self.bandwidth = 0.8  # Default bandwidth scaling factor
    self.c_d = 2.0 * pow(np.pi, self.d / 2.0) / \
               (self.d * gamma(self.d / 2.0))
    self.h = self._compute_h(self.n, self.d, self.c_d, self.bandwidth)
    self._set_bandwith_factors(a)

    # Generate samples
    if counts is None:
        return self._gen_samples(n_samples)
    else:
        print 'KDE: Running on dataset of size n: %d d: %d and ' \
              'generating %d samples.' % (self.n, self.d,
                                          sum(counts.values()))
        self._gen_spec_limits(X)
        return self._gen_partitioned_samples(counts)
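# _compute_h is called above but not defined in this section. The sketch
# below is a hypothetical reconstruction, assuming the rule-of-thumb
# optimal bandwidth from Silverman (1986), eq. 4.14, for the multivariate
# Epanechnikov kernel:
#   h_opt = scale * A(K) * n^(-1/(d+4)),
#   A(K)  = [8 * (d + 4) * (2 * sqrt(pi))^d / c_d]^(1/(d+4)),
# where `scale` corresponds to self.bandwidth (0.8) above. Treat this as
# a sketch under those assumptions, not the project's actual code.
def _compute_h(self, n, d, c_d, scale):
    """Hypothetical rule-of-thumb bandwidth for the Epanechnikov kernel."""
    a_k = pow(8.0 * (d + 4) * pow(2.0 * np.sqrt(np.pi), d) / c_d,
              1.0 / (d + 4))
    return scale * a_k * pow(n, -1.0 / (d + 4))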
def fit(self, chips):
    """Run Laplacian Score Feature Selection.

    .. note:: Eventually, it'd be nice to maintain column names with Xin
       so that we can add a plot method to plot scores vs. column names.

    Notes
    -----
    This code is based on the definition from the paper [1]_:

    .. math:: L_r = \\frac{\\sum_{ij} (f_r^i - f_r^j)^2 S_{ij}}{\\sigma_r^2}

    .. [1] He, X., Cai, D., and Niyogi, P., "Laplacian Score for Feature
       Selection", NIPS 2005.

    Parameters
    ----------
    chips : list
        A list of chip objects.
    """
    X = np.array([chip.LCT.values() for chip in chips])
    gnd = np.array([chip.gnd for chip in chips])
    assert X.shape[0] == len(gnd), \
        "Data and gnd do not have matching sizes"
    _, X = standardize(X)

    # Per the LSFS paper, S_ij = exp(-||x_i - x_j||^2 / t). I've found
    # t = ncol(X) to be a suitable choice; anything on that order should
    # work just fine.
    S = self._construct_w(X, gnd, t=X.shape[1])
    D = np.sum(S, 1)
    dot_d_x = np.dot(D, X)
    z = (dot_d_x * dot_d_x) / np.sum(D)

    # Per-feature weighted variance and Laplacian quadratic form; both
    # sums run over the sample axis so each result has length d.
    dprime = np.sum(np.dot(X.T, np.diag(D)).T * X, 0) - z
    lprime = np.sum(np.dot(X.T, S).T * X, 0) - z

    # Remove trivial solutions
    dprime[dprime < 1e-12] = np.inf

    # Compute and retain Laplacian scores and rankings
    self.scores = (lprime / dprime).T
    self.ranking = np.argsort(-self.scores)
    del S  # Clean up to save memory
    return self
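# _construct_w is not shown in this section. A minimal sketch follows,
# assuming the supervised variant of the LSFS heat kernel: S_ij =
# exp(-||x_i - x_j||^2 / t) when samples i and j share the same gnd
# (pass/fail) label, and 0 otherwise. The gnd-masking choice is an
# assumption, not the project's confirmed implementation.
def _construct_w(self, X, gnd, t):
    """Hypothetical within-class heat-kernel weight matrix."""
    n = X.shape[0]
    S = np.zeros((n, n))
    for i in xrange(n):
        for j in xrange(n):
            if gnd[i] == gnd[j]:
                diff = X[i, :] - X[j, :]
                S[i, j] = np.exp(-np.dot(diff, diff) / t)
    return S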
def _gen_partitioned_samples(self, counts):
    """Generate nGood good devices, nCritical critical devices, and
    nFail failing devices, with each region defined by specs.inner /
    specs.outer.
    """
    # Preallocate arrays for speed
    Sg = np.zeros((counts['nGood'], self.d))
    Sc = np.zeros((counts['nCritical'], self.d))
    Sf = np.zeros((counts['nFail'], self.d))
    ng, nc, nf = 0, 0, 0
    thresh = 0.02
    n_total = sum(counts.values())

    while ng + nc + nf < n_total:
        sample = standardize(self._gen_sample(),
                             self.scale_factors, reverse=True)
        if self._is_good(sample) and ng < counts['nGood']:
            Sg[ng, :] = sample
            ng += 1
        if self._is_failing(sample) and nf < counts['nFail']:
            Sf[nf, :] = sample
            nf += 1
        if self._is_critical(sample) and nc < counts['nCritical']:
            Sc[nc, :] = sample
            nc += 1

        # Print the number generated in each category so we can monitor
        # progress, since this can take a while.
        if float(ng + nc + nf) / n_total > thresh:
            print 'Ng:%i/%i Nc:%i/%i Nf:%i/%i' % \
                (ng, counts['nGood'],
                 nc, counts['nCritical'],
                 nf, counts['nFail'])
            thresh += 0.02

    print 'Non-parametric density estimation sampling complete.'
    return pandas.DataFrame(np.vstack((Sc, Sg, Sf)),
                            columns=self.columns)
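# The _is_good / _is_critical / _is_failing predicates are defined
# elsewhere. A hedged sketch, assuming _gen_spec_limits stores
# specs.inner and specs.outer as (lo, hi) limit arrays on self: good
# devices lie inside the inner limits, failing devices violate the outer
# limits, and critical devices fall in the band between the two.
# Attribute names and shapes here are assumptions.
def _is_good(self, sample):
    return np.all((sample >= self.inner[0]) & (sample <= self.inner[1]))

def _is_failing(self, sample):
    return np.any((sample < self.outer[0]) | (sample > self.outer[1]))

def _is_critical(self, sample):
    return not self._is_good(sample) and not self._is_failing(sample)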
def _gen_samples(self, n_samples):
    """Generate KDE samples."""
    Sn = np.vstack([self._gen_sample() for _ in xrange(n_samples)])
    sample = standardize(Sn, self.scale_factors, reverse=True)
    return pandas.DataFrame(sample, columns=self.columns)
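# _gen_sample is not shown in this section. The sketch below assumes the
# standard smoothed-bootstrap KDE sampler: draw a training point at
# random from the normalized data Xn, then perturb it with Epanechnikov-
# distributed noise scaled by the bandwidth h. The noise draw uses
# Devroye's three-uniforms trick for the Epanechnikov kernel; this is an
# assumed reconstruction, not the project's confirmed code.
def _gen_sample(self):
    """Hypothetical single-sample draw from the fitted KDE."""
    i = np.random.randint(self.n)
    v = np.random.uniform(-1.0, 1.0, (3, self.d))
    # If |v3| is the largest of the three, deliver v2; otherwise deliver
    # v3. The result is Epanechnikov-distributed on [-1, 1].
    eps = np.where((np.abs(v[2]) >= np.abs(v[0])) &
                   (np.abs(v[2]) >= np.abs(v[1])), v[1], v[2])
    return np.asarray(self.Xn)[i, :] + self.h * eps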