def tau_filt(self, tau_d=None):
    if tau_d is None:
        tau_d = self._taus
    t_low = 0.001
    t_hi = 0.01
    median_taus = []
    num_taus = []
    # dict.keys() is not subscriptable in Python 3, so take the first key explicitly
    proto_entry = tau_d[next(iter(tau_d))][0]
    det_taus = {det: [] for det in range(len(proto_entry))}
    for k in tau_d:
        for taus in tau_d[k]:
            for det in range(len(taus)):
                det_taus[det] += [taus[det]]
    medians = []
    means = []
    var = []
    iqr = []
    for k in det_taus:
        medians += [np.median(det_taus[k])]
        means += [np.mean(det_taus[k])]
        var += [np.var(det_taus[k])]
        iqr += [stats.iqr(det_taus[k])]
    #t_low = np.array(means) - np.array(iqr)
    #t_hi = np.array(means) + np.array(iqr)
    t_low = np.array(means) - 4 * np.sqrt(np.array(var))
    t_hi = np.array(means) + 4 * np.sqrt(np.array(var))
    for k in tau_d:
        count = 0
        for taus in tau_d[k]:
            b = (taus > t_low) * (taus < t_hi)
            self._cuts[k][count] *= b
            count += 1
    return medians, means, var, iqr
def prepare_y_data(self, chunk_len):
    # update scaling stats
    self.stats_update_counter += chunk_len
    if self.stats_update_counter > self.n_samples // 3:
        self.mean = np.nanmean(self.y_raw_buffer, 0)
        self.iqr = stats.iqr(self.y_raw_buffer, 0, rng=(0, 100), nan_policy='omit')
        self.iqr[self.iqr <= 0] = 1
        self.stats_update_counter = 0
    # return scaled signals
    return ((self.y_raw_buffer - self.mean) / self.iqr)[:, self.c_slice]
def parse_kallisto_stats(abundance):
    import numpy as np
    import pandas as pd  # needed for read_table below (may already be imported at module level)
    stats = dict()
    df = pd.read_table(abundance, sep="\t")
    stats['transcripts'] = df.shape[0]
    stats['zero-count_transcripts'] = (df['est_counts'] == 0).sum()
    stats['non-zero-count_transcripts'] = (df['est_counts'] > 0).sum()
    log_tpm = np.log2(1 + df['tpm'])
    stats['log2tpm_mean'] = log_tpm.mean()
    stats['log2tpm_median'] = log_tpm.median()
    p_log_tpm = np.log2(1 + df['tpm'].where(lambda x: x > 0)).dropna()
    stats['non-zero_log2tpm_mean'] = p_log_tpm.mean()
    stats['non-zero_log2tpm_median'] = p_log_tpm.median()
    try:
        from scipy.stats import iqr
        stats['log2tpm_iqr'] = iqr(log_tpm)
        stats['non-zero_log2tpm_iqr'] = iqr(p_log_tpm)
    except ImportError:
        stats['log2tpm_iqr'] = np.nan
        stats['non-zero_log2tpm_iqr'] = np.nan
    return stats
def get_outliers(self, data, thr):
    """Detects outlier points based on chosen method and threshold."""
    # get outlier threshold
    iqr = stats.iqr(data, nan_policy='omit')  # inter-quartile range
    bound = thr * iqr  # multiple of IQR
    high_bound = np.percentile(data[~np.isnan(data)], 75) + bound
    low_bound = np.percentile(data[~np.isnan(data)], 25) - bound
    # compare data to bounds
    old_settings = np.seterr(invalid='ignore')
    outliers = np.where((data < low_bound) | (data > high_bound))[0]
    np.seterr(**old_settings)
    return outliers
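# A standalone sketch of the same Tukey/IQR-fence rule used by get_outliers above,
# shown outside the class for clarity; the function name and example values are
# hypothetical and not part of the original module.
import numpy as np
from scipy import stats

def iqr_outliers(data, thr=1.5):
    """Return indices of points lying more than thr * IQR outside the quartiles."""
    data = np.asarray(data, dtype=float)
    clean = data[~np.isnan(data)]
    bound = thr * stats.iqr(clean)
    low = np.percentile(clean, 25) - bound
    high = np.percentile(clean, 75) + bound
    return np.where((data < low) | (data > high))[0]

# Example: iqr_outliers([1, 2, 3, 2, 50]) -> array([4])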
def interquartile_range(full_list_X, full_list_Y):
    rowx = list()
    rowy = list()
    iqrX = list()
    iqrY = list()
    for i in range(len(full_list_X)):
        x = full_list_X[i]
        rowx = (stats.iqr(x, axis=0)).tolist()
        # zip() returns an iterator in Python 3, so materialise it before indexing
        temp = list(zip(*(full_list_Y[i])))
        y1 = list(temp[0])
        iqrY.append(y1[0])
        iqrX.append(rowx)
        rowx = list()
        rowy = list()
    return iqrX, iqrY
def add_features_in_group(features, gr_, feature_name, aggs, prefix):
    for agg in aggs:
        if agg == 'sum':
            features['{}{}_sum'.format(prefix, feature_name)] = gr_[feature_name].sum()
        elif agg == 'mean':
            features['{}{}_mean'.format(prefix, feature_name)] = gr_[feature_name].mean()
        elif agg == 'max':
            features['{}{}_max'.format(prefix, feature_name)] = gr_[feature_name].max()
        elif agg == 'min':
            features['{}{}_min'.format(prefix, feature_name)] = gr_[feature_name].min()
        elif agg == 'std':
            features['{}{}_std'.format(prefix, feature_name)] = gr_[feature_name].std()
        elif agg == 'count':
            features['{}{}_count'.format(prefix, feature_name)] = gr_[feature_name].count()
        elif agg == 'skew':
            features['{}{}_skew'.format(prefix, feature_name)] = skew(gr_[feature_name])
        elif agg == 'kurt':
            features['{}{}_kurt'.format(prefix, feature_name)] = kurtosis(gr_[feature_name])
        elif agg == 'iqr':
            features['{}{}_iqr'.format(prefix, feature_name)] = iqr(gr_[feature_name])
        elif agg == 'median':
            features['{}{}_median'.format(prefix, feature_name)] = gr_[feature_name].median()
    return features
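# A minimal usage sketch for add_features_in_group above; the DataFrame, column
# names, and prefix are hypothetical and only illustrate the expected inputs
# (a per-group frame plus a dict of already-computed features).
import pandas as pd
from scipy.stats import skew, kurtosis, iqr

payments = pd.DataFrame({
    'customer_id': [1, 1, 1, 2, 2],
    'payment': [100.0, 250.0, 80.0, 40.0, 60.0],
})

features = {'customer_id': 1}
group = payments[payments['customer_id'] == 1]
features = add_features_in_group(features, group, 'payment',
                                 aggs=['mean', 'max', 'iqr', 'median'],
                                 prefix='payments_')
# features now contains keys such as 'payments_payment_mean' and 'payments_payment_iqr'.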
def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
    """
    Compute the Epps-Singleton (ES) test statistic.

    Test the null hypothesis that two samples have the same underlying
    probability distribution.

    Parameters
    ----------
    x, y : array-like
        The two samples of observations to be tested. Input must not have
        more than one dimension. Samples can have different lengths.
    t : array-like, optional
        The points (t1, ..., tn) where the empirical characteristic function
        is to be evaluated. It should be positive distinct numbers. The
        default value (0.4, 0.8) is proposed in [1]_. Input must not have
        more than one dimension.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The associated p-value based on the asymptotic chi2-distribution.

    See Also
    --------
    ks_2samp, anderson_ksamp

    Notes
    -----
    Testing whether two samples are generated by the same underlying
    distribution is a classical question in statistics. A widely used test is
    the Kolmogorov-Smirnov (KS) test which relies on the empirical
    distribution function. Epps and Singleton introduce a test based on the
    empirical characteristic function in [1]_.

    One advantage of the ES test compared to the KS test is that it does not
    assume a continuous distribution. In [1]_, the authors conclude that the
    test also has a higher power than the KS test in many examples. They
    recommend the use of the ES test for discrete samples as well as
    continuous samples with at least 25 observations each, whereas
    `anderson_ksamp` is recommended for smaller sample sizes in the
    continuous case.

    The p-value is computed from the asymptotic distribution of the test
    statistic which follows a `chi2` distribution. If the sample size of both
    `x` and `y` is below 25, the small sample correction proposed in [1]_ is
    applied to the test statistic.

    The default values of `t` are determined in [1]_ by considering various
    distributions and finding good values that lead to a high power of the
    test in general. Table III in [1]_ gives the optimal values for the
    distributions tested in that study. The values of `t` are scaled by the
    semi-interquartile range in the implementation, see [1]_.

    References
    ----------
    .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
       problem using the empirical characteristic function", Journal of
       Statistical Computation and Simulation 26, p. 177--203, 1986.
    .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
       - the Epps-Singleton two-sample test using the empirical characteristic
       function", The Stata Journal 9(3), p. 454--465, 2009.
""" x, y, t = np.asarray(x), np.asarray(y), np.asarray(t) # check if x and y are valid inputs if x.ndim > 1: raise ValueError('x must be 1d, but x.ndim equals {}.'.format(x.ndim)) if y.ndim > 1: raise ValueError('y must be 1d, but y.ndim equals {}.'.format(y.ndim)) nx, ny = len(x), len(y) if (nx < 5) or (ny < 5): raise ValueError('x and y should have at least 5 elements, but len(x) ' '= {} and len(y) = {}.'.format(nx, ny)) if not np.isfinite(x).all(): raise ValueError('x must not contain nonfinite values.') if not np.isfinite(y).all(): raise ValueError('y must not contain nonfinite values.') n = nx + ny # check if t is valid if t.ndim > 1: raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim)) if np.less_equal(t, 0).any(): raise ValueError('t must contain positive elements only.') # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid # circular import from scipy.stats import iqr sigma = iqr(np.hstack((x, y))) / 2 ts = np.reshape(t, (-1, 1)) / sigma # covariance estimation of ES test gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T # shape = (nx, 2*len(t)) gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T cov_x = np.cov(gx.T, bias=True) # the test uses biased cov-estimate cov_y = np.cov(gy.T, bias=True) est_cov = (n/nx)*cov_x + (n/ny)*cov_y est_cov_inv = np.linalg.pinv(est_cov) r = np.linalg.matrix_rank(est_cov_inv) if r < 2*len(t): warnings.warn('Estimated covariance matrix does not have full rank. ' 'This indicates a bad choice of the input t and the ' 'test might not be consistent.') # see p. 183 in [1]_ # compute test statistic w distributed asympt. as chisquare with df=r g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0) w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff)) # apply small-sample correction if (max(nx, ny) < 25): corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7))) w = corr * w p = chi2.sf(w, r) return Epps_Singleton_2sampResult(w, p)
sample[2] = np.random.laplace(0, scale_laplace, sample_size)
sample[3] = np.random.standard_cauchy(size=sample_size)
sample[4] = np.concatenate([
    np.random.standard_normal(size=mixed_size_a),
    np.random.normal(0, scale_mixed, mixed_size_b)
])
for i in range(5):
    sample[i] = np.sort(sample[i])

# Calculate all sums of characteristics of samples
for i in range(5):
    mean_square_dev[i] += mean_square(sample[i])
    average_absolute_dev[i] += average_absolute(sample[i])
    average_range[i] += av_range(sample[i])
    inter_quartile_range[i] += stats.iqr(sample[i])
    median_absolute_dev[i] += median_absolute_deviation(sample[i])
    mean_square_dev_square[i] += pow(mean_square(sample[i]), 2)
    average_absolute_dev_square[i] += pow(average_absolute(sample[i]), 2)
    average_range_square[i] += pow(av_range(sample[i]), 2)
    inter_quartile_range_square[i] += pow(stats.iqr(sample[i]), 2)
    median_absolute_dev_square[i] += pow(
        median_absolute_deviation(sample[i]), 2)

print_mean_result('s', mean_square_dev, mean_square_dev_square)
print_mean_result('d', average_absolute_dev, average_absolute_dev_square)
print_mean_result('R', average_range, average_range_square)
print_mean_result('IQR', inter_quartile_range, inter_quartile_range_square)
print_mean_result('MAD', median_absolute_dev, median_absolute_dev_square)
def rmse(self, ground_truth, simulation, join='inner', fill_value=0,
         relative=False, cumulative=False, normed=False):
    """
    Metric: rmse

    Description: Root mean squared error

    Inputs:
        ground_truth - ground truth measurement (data frame) with measurement
            in the "value" column
        simulation - simulation measurement (data frame) with measurement in
            the "value" column
        join - type of join to perform between ground truth and simulation
        fill_value - fill value for non-overlapping joins
    """
    if type(ground_truth) is np.ndarray:
        result = ground_truth - simulation
        result = (result ** 2).mean()
        result = np.sqrt(result)
        return result

    if type(ground_truth) is list:
        ground_truth = np.nan_to_num(ground_truth)
        simulation = np.nan_to_num(simulation)
        result = np.asarray(ground_truth) - np.asarray(simulation)
        result = (result ** 2).mean()
        result = np.sqrt(result)
        return result

    df = self.join_dfs(ground_truth, simulation, join=join,
                       fill_value=fill_value)

    if len(df.index) > 0:
        if cumulative:
            df['value_sim'] = df['value_sim'].cumsum()
            df['value_gt'] = df['value_gt'].cumsum()
        if normed:
            epsilon = 0.001 * df[df['value_gt'] != 0.0]['value_gt'].min()
            df['value_sim'] = (df['value_sim'] + epsilon) / (df['value_sim'].max() + epsilon)
            df['value_gt'] = (df['value_gt'] + epsilon) / (df['value_gt'].max() + epsilon)

        if not relative:
            return np.sqrt(((df["value_sim"] - df["value_gt"]) ** 2).mean())
        else:
            iq_range = float(iqr(df['value_gt'].values))
            result = df["value_sim"] - df["value_gt"]
            result = (result ** 2).mean()
            result = np.sqrt(result)
            if iq_range > 0:
                result = result / iq_range
            else:
                mean_value = df['value_gt'].mean()
                if mean_value > 0:
                    result = result / mean_value
                else:
                    return None
            return result
    else:
        return None
A_po = evok_po.data
A_pr = evok_pr.data

# compute cohen d
M1 = np.mean(A_po, axis=1)
M2 = np.mean(A_pr, axis=1)
std1 = np.std(A_po, axis=1)
std2 = np.std(A_pr, axis=1)
n1 = A_po.shape[1]
n2 = A_pr.shape[1]
std = np.sqrt(np.divide((n1-1)*std1**2 + (n2-1)*std2**2, (n1+n2-2)))
cohen = np.divide(M1-M2, std)

# Compute number of bins
iqr = spstats.iqr(M1)
n = M1.size
maximum = np.max(M1)
minimum = np.min(M1)
h = 2*iqr/(n**(1/3))
nbin = (maximum - minimum)/h

from numpy import inf
M1[M1 == -inf] = 0
M2[M2 == -inf] = 0

plt.hist(M2, bins=80)
plt.xlabel('Mean normalised Amplitude (dB)')
plt.ylabel('Number of electrodes')
plt.title('Prestimulus mean stimulus related HFB amplitude (-400 to -100 ms)')

plt.hist(M1, bins=80)
plt.xlabel('Mean normalised Amplitude (dB)')
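# The bin computation above follows the Freedman-Diaconis rule
# (bin width h = 2*IQR/n^(1/3)). A small self-contained helper for the same
# idea is sketched below; the helper name is hypothetical, not from the
# original script.
import numpy as np
from scipy import stats as spstats

def freedman_diaconis_bins(values):
    """Return a histogram bin count from the Freedman-Diaconis rule."""
    values = np.asarray(values)
    h = 2 * spstats.iqr(values) / values.size ** (1 / 3)
    if h <= 0:
        return 1
    return int(np.ceil((values.max() - values.min()) / h))

# Example: plt.hist(M1, bins=freedman_diaconis_bins(M1))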
# plot scatter graph
plt.figure()
plt.scatter(x, norm_y, color='black')
plt.title("Values over Time to Identify Outliers")
plt.xlabel("Data Reading (Time)")
plt.ylabel("0-1 Normalised Value")
plt.grid(True)

# compute averages
# NOTE: `len` and `range` below shadow the Python builtins of the same name.
len = len(y)
mean = np.mean(y)
median = np.median(y)
mode = stats.mode(y)[0][0]
mode_count = stats.mode(y)[1][0]
range = np.max(y) - np.min(y)
iqrange = stats.iqr(y)
std_dev = np.std(y)
z_score = stats.zscore(y)
std_err = stats.sem(y)
con_inter = stats.bayes_mvs(y, alpha=0.95)  # 95% confidence interval for mean, var, and std reported as (center, (lower, upper))

# plot z scores scatter graph
plt.figure()
plt.scatter(x, z_score, color='black')
plt.title("Z-Scores of Data Points")
plt.xlabel("Data Reading (Time)")
plt.ylabel("Z-Score")
plt.grid(True)

# print descriptive statistics
def get_textual_metadata(annotated_content, size_kb, wsdir, master, idno_file): root_document = etree.parse(wsdir + master + idno_file).getroot() specific_namespaces = { 'tei': 'http://www.tei-c.org/ns/1.0', 'xi': 'http://www.w3.org/2001/XInclude', 'cligs': 'https://cligs.hypotheses.org/ns/cligs' } chapters = root_document.xpath("//tei:body//tei:div[@type='chapter']", namespaces=specific_namespaces) len_chapters = [] for chapter in chapters: len_chapters.append( len(" ".join( chapter.xpath(".//text()", namespaces=specific_namespaces)))) len_chapters = np.array(len_chapters) text_measures = "" text_measures = text_measures + '\n\t\t\t\t<measure unit="chapters.len.mean">' + str( "%.2f" % round(len_chapters.mean(), 2)) + r'</measure>' text_measures = text_measures + '\n\t\t\t\t<measure unit="chapters.len.std">' + str( "%.2f" % round(len_chapters.std(), 2)) + r'</measure>' text_measures = text_measures + '\n\t\t\t\t<measure unit="chapters.len.median">' + str( "%.2f" % round(np.percentile(len_chapters, q=50), 2)) + r'</measure>' text_measures = text_measures + '\n\t\t\t\t<measure unit="chapters.len.iqr">' + str( "%.2f" % round(stats.iqr(len_chapters), 2)) + r'</measure>' content_abstract = re.findall(r'<abstract.*?>(.*?)</abstract>', annotated_content, flags=re.DOTALL)[0] content_abstract = re.sub(r'</?.*?>', r'', content_abstract, flags=re.DOTALL) content_abstract = re.sub(r'\s\s+', r' ', content_abstract) len_abstract = str(len(content_abstract)) annotated_content = re.sub(r'<teiHeader>.*?</teiHeader>', r'', annotated_content, flags=re.DOTALL) # Divs and groups of lines are counted divs = str(annotated_content.count("<div")) lines = str(len(re.findall(r'\n+', annotated_content))) # Diferent TEI elements are counted chapters = str( len(re.findall(r'<div[^>]*?type="chapter"', annotated_content))) short_stories = str( len(re.findall(r'<div[^>]*?type="shortStories"', annotated_content))) parts = str(len(re.findall(r'<div[^>]*?type="part"', annotated_content))) sections = str( len(re.findall(r'<div[^>]*?type="section"', annotated_content))) divisions = str( len(re.findall(r'<div[^>]*?type="division"', annotated_content))) blocks = str( len( re.findall(r'<(l|ab|head|stage|sp|p|ab)( .+?|)>', annotated_content))) line_verses = str(len(re.findall(r'<(l)( .+?|)>', annotated_content))) heads = str(len(re.findall(r'<(head)( .+?|)>', annotated_content))) stages = str(len(re.findall(r'<(stage)( .+?|)>', annotated_content))) sps = str(len(re.findall(r'<(sp)( .+?|)>', annotated_content))) ps = str(len(re.findall(r'<(p)( .+?|)>', annotated_content))) abs_ = str(len(re.findall(r'<(ab)( .+?|)>', annotated_content))) lg_poems = str(len(re.findall(r'<lg type="poem">', annotated_content))) lg_stanzas = str(len(re.findall(r'<lg type="stanza">', annotated_content))) ft = str(len(re.findall(r'<(floatingText)( .+?|)>', annotated_content))) punctual_ss = str( len(re.findall(r'<milestone unit="s"/>', annotated_content))) # The paragraphas that have right after a punctuation mark that presents direct speech are counted saids = str(len(re.findall(r'<said>', annotated_content))) speech_ps = str( len(re.findall(r'<p rend="direct-speech">', annotated_content))) narrative_ps = str(len(re.findall(r'<p>', annotated_content))) # Then the text is converted into plaintext and the white space cleaned plain_body = annotated_content plain_body = re.sub(r'</?.*?>', r'', plain_body, flags=re.DOTALL) plain_body = re.sub(r'[\t ]+', r' ', plain_body) plain_body = re.sub(r'\n[\n]+', r'\n', plain_body) # Characters and words are counted 
chars = str(len(plain_body)) tokens = str(len(re.findall(r'[\wáéíóúñü\d]+', plain_body))) # If we want some more info, the ammount of numbers and punctuation marks are counted numerals = str(len(re.findall(r'\d+', plain_body))) puncts = str( len( re.findall(r'[!"\#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~¿¡…—–~»«]', plain_body))) textual_metadata = r'\n\t\t\t\t<measure unit="lines">' + re.escape( lines) + r'</measure>\n\t\t\t\t<measure unit="divs">' + re.escape( divs ) + r'</measure>\n\t\t\t\t<measure unit="tokens">' + re.escape( tokens ) + r'</measure>\n\t\t\t\t<measure unit="chars">' + re.escape( chars ) + r'</measure>\n\t\t\t\t<measure unit="size_kb">' + re.escape( size_kb ) + r'</measure>\n\t\t\t\t<measure unit="chapters">' + re.escape( chapters ) + r'</measure>\n\t\t\t\t<measure unit="shortStories">' + re.escape( short_stories ) + r'</measure>\n\t\t\t\t<measure unit="parts">' + re.escape( parts ) + r'</measure>\n\t\t\t\t<measure unit="sections">' + re.escape( sections ) + r'</measure>\n\t\t\t\t<measure unit="divisions">' + re.escape( divisions ) + r'</measure> \n\t\t\t\t<measure unit="blocks">' + re.escape( blocks ) + r'</measure> \n\t\t\t\t<measure unit="lg.poems">' + re.escape( lg_poems ) + r'</measure> \n\t\t\t\t<measure unit="lg.stanzas">' + re.escape( lg_stanzas ) + r'</measure> \n\t\t\t\t<measure unit="line.verses">' + re.escape( line_verses ) + r'</measure> \n\t\t\t\t<measure unit="heads">' + re.escape( heads ) + r'</measure> \n\t\t\t\t<measure unit="stages">' + re.escape( stages ) + r'</measure> \n\t\t\t\t<measure unit="sps">' + re.escape( sps ) + r'</measure> \n\t\t\t\t<measure unit="paragraphs">' + re.escape( ps ) + r'</measure> \n\t\t\t\t<measure unit="abs">' + re.escape( abs_ ) + r'</measure> \n\t\t\t\t<measure unit="fts">' + re.escape( ft ) + r'</measure>\n\t\t\t\t<measure unit="paragraphs.ds">' + re.escape( speech_ps ) + r'</measure>\n\t\t\t\t<measure unit="saids">' + re.escape( saids ) + r'</measure>\n\t\t\t\t<measure unit="narrative.ps">' + re.escape( narrative_ps ) + r'</measure>\n\t\t\t\t<measure unit="punctual_ss">' + re.escape( punctual_ss ) + r'</measure> \n\t\t\t\t<measure unit="numerals">' + re.escape( numerals ) + r'</measure> \n\t\t\t\t<measure unit="puncts">' + re.escape( puncts ) + r'</measure> \n\t\t\t\t<measure unit="len.abstract">' + re.escape( len_abstract) + r'</measure>' + text_measures return textual_metadata
def run(input, mask, outputfile, verbose, dimensions, svdradius, haralickwindow, binsize,label, extendstats): """CoLlAGe captures subtle anisotropic differences in disease pathologies by measuring entropy of co-occurrences of voxel-level gradient orientations on imaging computed within a local neighborhood.""" if input.endswith('.csv'): header = ['ID', 'Image', 'Mask', 'svdradius', 'haralickwindow', 'binsize', 'label'] features_list = [] list_failed_cases = [['ID', 'Image', 'Mask', 'Error']] if dimensions == 2: suffix = '' for feature in collageradiomics.HaralickFeature: if extendstats: features_list.extend(['Collage'+feature.name+'Median'+suffix, 'Collage'+feature.name+'IQR'+suffix, 'Collage'+feature.name+'Skewness'+suffix, 'Collage'+feature.name+'Kurtosis'+suffix, 'Collage'+feature.name+'Mean'+suffix, 'Collage'+feature.name+'Variance'+suffix]) else: features_list.extend(['Collage'+feature.name+'Median'+suffix, 'Collage'+feature.name+'Skewness'+suffix, 'Collage'+feature.name+'Kurtosis'+suffix, 'Collage'+feature.name+'Variance'+suffix]) header.append(features_list) output_list = [header] else: for suffix in ['Theta', 'Phi']: for feature in collageradiomics.HaralickFeature: if extendstats: features_list.extend(['Collage'+feature.name+'Median'+suffix, 'Collage'+feature.name+'IQR'+suffix, 'Collage'+feature.name+'Skewness'+suffix, 'Collage'+feature.name+'Kurtosis'+suffix, 'Collage'+feature.name+'Mean'+suffix, 'Collage'+feature.name+'Variance'+suffix]) else: features_list.extend(['Collage'+feature.name+'Median'+suffix, 'Collage'+feature.name+'Skewness'+suffix, 'Collage'+feature.name+'Kurtosis'+suffix, 'Collage'+feature.name+'Variance'+suffix]) header.extend(features_list) output_list = [header] with open(input, newline='') as csvfile: reader = csv.DictReader(csvfile) for row in reader: output_case = [] try: case_id = row['ID'] image_filepath = row['Image'] mask_filepath = row['Mask'] image = sitk.ReadImage(image_filepath) mask = sitk.ReadImage(mask_filepath) output_case.extend([case_id, image_filepath, mask_filepath, svdradius, haralickwindow, binsize, label]) # Check if user wants to select single label from the mask if label != -1: mask = sitk.BinaryThreshold(mask, lowerThreshold = label, upperThreshold = label, insideValue = 1, outsideValue = 0) image_array = sitk.GetArrayFromImage(image) mask_array = sitk.GetArrayFromImage(mask) # Collage is expecting array with x,y,z but sitk.GetArrayFromImage as z,y,x, so x show be swapped by z if dimensions != 2: image_array = np.swapaxes(image_array,0,2) mask_array = np.swapaxes(mask_array,0,2) # Remove any extra array dimensions if the user explicitly asks for 2D. if dimensions == 2: image_array = image_array[:,:,0] mask_array = mask_array [:,:,0] collage = collageradiomics.Collage( image_array, mask_array, svd_radius=svdradius, verbose_logging=verbose, num_unique_angles=binsize) collage.execute() for feature in collageradiomics.HaralickFeature: feature_output = collage.get_single_feature_output(feature) if image_array.ndim == 2: feature_output = feature_output[~np.isnan(feature_output)] # NumPy supports median natively, we'll use that. median = np.nanmedian(feature_output, axis=None) # Use SciPy for kurtosis, variance, and skewness. 
feature_stats = stats.describe(feature_output, axis=None) if extendstats: mean = feature_stats.mean #np.nanmean(feature_output, axis=None) iqr = stats.iqr(feature_output) output_case.extend([median, iqr, feature_stats.skewness, feature_stats.kurtosis, feature_stats.mean, feature_stats.variance]) else: output_case.extend([median, feature_stats.skewness, feature_stats.kurtosis, feature_stats.variance]) else: # Extract phi and theta angles. feature_output_theta = feature_output[:,:,:,0] feature_output_phi = feature_output[:,:,:,1] # Remove NaN for stat calculations. feature_output_theta = feature_output_theta[~np.isnan(feature_output_theta)] feature_output_phi = feature_output_phi[~np.isnan(feature_output_phi)] # NumPy supports median natively, we'll use that. median_theta = np.nanmedian(feature_output_theta, axis=None) median_phi = np.nanmedian(feature_output_phi, axis=None) # Use SciPy for kurtosis, variance, and skewness. feature_stats_theta = stats.describe(feature_output_theta.flatten(), axis=None) feature_stats_phi = stats.describe(feature_output_phi.flatten(), axis=None) if extendstats: mean_theta = feature_stats_theta.mean mean_phi = feature_stats_phi.mean iqr_theta = stats.iqr(feature_output_theta) iqr_phi = stats.iqr(feature_output_phi) output_case.extend([median_theta, iqr_theta, feature_stats_theta.skewness, feature_stats_theta.kurtosis, feature_stats_theta.mean, feature_stats_theta.variance, median_phi, iqr_phi, feature_stats_phi.skewness, feature_stats_phi.kurtosis, feature_stats_phi.mean, feature_stats_phi.variance]) else: output_case.extend([median_theta, feature_stats_theta.skewness, feature_stats_theta.kurtosis, feature_stats_theta.variance, median_phi, feature_stats_phi.skewness, feature_stats_phi.kurtosis, feature_stats_phi.variance]) output_list.append(output_case) except RuntimeError as err: list_failed_cases.append([case_id, image_filepath, mask_filepath, err]) except ValueError as err: list_failed_cases.append([case_id, image_filepath, mask_filepath, err]) # Create collage radiomic features output csv file with open(outputfile, 'w') as file: writer = csv.writer(file) writer.writerows(output_list) # Create errors output csv file with open(os.path.join(os.path.dirname(outputfile), 'errors_' + os.path.basename(outputfile)), 'w') as file: writer = csv.writer(file) writer.writerows(list_failed_cases) else: image = sitk.ReadImage(input) mask = sitk.ReadImage(mask) # Check if user wants to select single label from the mask if label != -1: mask = sitk.BinaryThreshold(mask, lowerThreshold = label, upperThreshold = label, insideValue = 1, outsideValue = 0) image_array = sitk.GetArrayFromImage(image) mask_array = sitk.GetArrayFromImage(mask) # Collage is expecting array with x,y,z but sitk.GetArrayFromImage as z,y,x, so x show be swapped by z if dimensions != 2: image_array = np.swapaxes(image_array,0,2) mask_array = np.swapaxes(mask_array,0,2) # Remove any extra array dimensions if the user explicitly asks for 2D. if dimensions == 2: image_array = image_array[:,:,0] mask_array = mask_array [:,:,0] collage = collageradiomics.Collage( image_array, mask_array, svd_radius=svdradius, verbose_logging=verbose, num_unique_angles=binsize) collage.execute() # Create a csv file at the passed in output file location. with open(outputfile, 'w', newline='') as csv_output_file: writer = csv.writer(csv_output_file) # Write the columns. 
writer.writerow(['FeatureName', 'Value']) for feature in collageradiomics.HaralickFeature: feature_output = collage.get_single_feature_output(feature) if image_array.ndim == 2: feature_output = feature_output[~np.isnan(feature_output)] # NumPy supports median natively, we'll use that. median = np.nanmedian(feature_output, axis=None) # Use SciPy for kurtosis, variance, and skewness. feature_stats = stats.describe(feature_output, axis=None) # Write CSV row for current feature. _write_csv_stats_row(writer, feature, median, feature_stats.skewness, feature_stats.kurtosis, feature_stats.variance) else: # Extract phi and theta angles. feature_output_theta = feature_output[:,:,:,0] feature_output_phi = feature_output[:,:,:,1] # Remove NaN for stat calculations. feature_output_theta = feature_output_theta[~np.isnan(feature_output_theta)] feature_output_phi = feature_output_phi[~np.isnan(feature_output_phi)] # NumPy supports median natively, we'll use that. median_theta = np.nanmedian(feature_output_theta, axis=None) median_phi = np.nanmedian(feature_output_phi, axis=None) # Use SciPy for kurtosis, variance, and skewness. feature_stats_theta = stats.describe(feature_output_theta.flatten(), axis=None) feature_stats_phi = stats.describe(feature_output_phi.flatten(), axis=None) if extendstats: mean_phi = feature_stats_phi.mean iqr_phi = stats.iqr(feature_output_phi.flatten()) mean_theta = feature_stats_theta.mean iqr_theta = stats.iqr(feature_output_theta.flatten()) _write_csv_extented_stats_row(writer, feature, median_theta, iqr_theta, feature_stats_theta.skewness, feature_stats_theta.kurtosis, mean_theta, feature_stats_theta.variance, 'Theta') _write_csv_extented_stats_row(writer, feature, median_phi, iqr_phi, feature_stats_phi.skewness, feature_stats_phi.kurtosis, mean_phi, feature_stats_phi.variance, 'Phi') else: # Write CSV rows for each angle. _write_csv_stats_row(writer, feature, median_theta, feature_stats_theta.skewness, feature_stats_theta.kurtosis, feature_stats_theta.variance, 'Theta') _write_csv_stats_row(writer, feature, median_phi, feature_stats_phi.skewness, feature_stats_phi.kurtosis, feature_stats_phi.variance, 'Phi')
recall = float(recall)

# calculate F1 score
if (precision + recall) != 0:
    f1 = (2 * precision * recall) / (precision + recall)
    f1_scores.append(f1)

# Calculate cross-validation average
print('\n-----------------------------------')
print('sklearn.tree.DecisionTreeClassifier Model 1')
print('\tFeatures: speed, X-accel, Y-accel, Z-accel, Z-jolt')
print('\tLabels: speedbump (1 = yes, 0 = no)')
print('\tAverage F1 score:', np.mean(f1_scores))
print('\tStdDev F1 score:', np.std(f1_scores))
print('\tMedian F1 score:', np.median(f1_scores))
print('\tIQR F1 score:', stats.iqr(f1_scores))
print('\tSkewness F1 score:', stats.skew(f1_scores))
print('\tZero F1 score:', f1_scores.count(0.00))

# # Decision Model Model 2
# # Separate Y and X variables
# df_label = df.loc[:, 'speedbump']
# df_feature = df.loc[:, ('Speed', 'X', 'Y', 'Z')]
# Y = df_label.as_matrix()
# X = df_feature.as_matrix()
# #
# # # Prepare for cross-validation
# clf = DecisionTreeClassifier(random_state=0)  # create a DecisionTreeClassifier
# f1_scores = []  # sum of F1 scores
# cv = 100  # number of cross-validations
def _demo_validate_data(): dim_x = 75 num_x_p = 500 num_x_n = 500 num_ch = 20 x_p_train = np.asarray( [np.random.randn(num_x_p, dim_x) for i in range(num_ch)]) x_n_train = np.array( [np.random.randn(num_x_p, dim_x) for i in range(num_ch)]) y_p_train = [1] * num_x_p y_n_train = [0] * num_x_n x_train = np.concatenate((x_n_train, x_p_train), axis=1) y_train = np.concatenate((y_n_train, y_p_train), axis=0) permutation = np.random.permutation(x_train.shape[1]) x_train = x_train[:, permutation, :] y_train = y_train[permutation] model = train_pca_rda_kde_model(x_train, y_train, k_folds=10) fig = plt.figure() ax = fig.add_subplot(211) x_plot = np.linspace(np.min(model.line_el[-1]), np.max(model.line_el[-1]), 1000)[:, np.newaxis] ax.plot(model.line_el[2][y_train == 0], -0.005 - 0.01 * np.random.random(model.line_el[2][y_train == 0].shape[0]), 'ro', label='class(-)') ax.plot(model.line_el[2][y_train == 1], -0.005 - 0.01 * np.random.random(model.line_el[2][y_train == 1].shape[0]), 'go', label='class(+)') for idx in range(len(model.pipeline[2].list_den_est)): log_dens = model.pipeline[2].list_den_est[idx].score_samples(x_plot) ax.plot(x_plot[:, 0], np.exp(log_dens), 'r-' * (idx == 0) + 'g-' * (idx == 1), linewidth=2.0) ax.legend(loc='upper right') plt.title('Training Data') plt.ylabel('p(e|l)') plt.xlabel('scores') # Test x_p_test = np.asarray( [np.random.randn(num_x_p, dim_x) for i in range(num_ch)]) x_n_test = np.array( [np.random.randn(num_x_p, dim_x) for i in range(num_ch)]) y_p_test = [1] * num_x_p y_n_test = [0] * num_x_n x_test = np.concatenate((x_n_test, x_p_test), axis=1) y_test = np.concatenate((y_n_test, y_p_test), axis=0) permutation = np.random.permutation(x_test.shape[1]) x_test = x_test[:, permutation, :] y_test = y_test[permutation] model.transform(x_test) ax.plot(model.line_el[2][y_test == 0], -0.01 - 0.01 * np.random.random(model.line_el[2][y_test == 0].shape[0]), 'bo', label='t_class(-)') ax.plot(model.line_el[2][y_test == 1], -0.01 - 0.01 * np.random.random(model.line_el[2][y_test == 1].shape[0]), 'ko', label='t_class(+)') bandwidth = 1.06 * min(np.std(model.line_el[2]), iqr(model.line_el[2]) / 1.34) * np.power( model.line_el[2].shape[0], -0.2) test_kde = KernelDensityEstimate(bandwidth=bandwidth) test_kde.fit(model.line_el[2], y_test) for idx in range(len(model.pipeline[2].list_den_est)): log_dens = test_kde.list_den_est[idx].score_samples(x_plot) ax.plot(x_plot[:, 0], np.exp(log_dens), 'b--' * (idx == 0) + 'k--' * (idx == 1), linewidth=2.0) ax.legend(loc='upper right') plt.title('Training Data') plt.ylabel('p(e|l)') plt.xlabel('scores') plt.show()
def Slice_Profile(IMG, results, options): """Extract a very basic SB profile along a line. A line of pixels can be identified by the user in image coordinates to extract an SB profile. Primarily intended for diagnostic purposes, this allows users to see very specific pixels. While this tool can be used for examining the disk structure (such as for edge on galaxies), users will likely prefer the more powerful :func:`~pipeline_steps.Axial_Profiles.Axial_Profiles` and :func:`~pipeline_steps.Radial_Profiles.Radial_Profiles` methods for such analysis. Parameters ----------------- ap_slice_anchor : dict, default None Coordinates for the starting point of the slice as a dictionary formatted "{'x': x-coord, 'y': y-coord}" in pixel units. ap_slice_pa : float, default None Position angle of the slice in degrees, counter-clockwise relative to the x-axis. ap_slice_length : float, default None Length of the slice from anchor point in pixel units. By default, use init ellipse semi-major axis length ap_slice_width : float, default 10 Width of the slice in pixel units. ap_slice_step : float, default None Distance between samples for the profile along the slice. By default use the PSF. ap_isoaverage_method : string, default 'median' Select the method used to compute the averafge flux along an isophote. Choose from 'mean', 'median', and 'mode'. In general, median is fast and robust to a few outliers. Mode is slow but robust to more outliers. Mean is fast and accurate in low S/N regimes where fluxes take on near integer values, but not robust to outliers. The mean should be used along with a mask to remove spurious objects such as foreground stars or galaxies, and should always be used with caution. ap_saveto : string, default None Directory in which to save profile ap_name : string, default None Name of the current galaxy, used for making filenames. ap_zeropoint : float, default 22.5 Photometric zero point. For converting flux to mag units. Notes ---------- :References: - 'background' (optional) - 'background noise' (optional) - 'center' (optional) - 'init R' (optional) - 'init pa' (optional) Returns ------- IMG : ndarray Unaltered galaxy image results : dict .. 
code-block:: python {} """ dat = IMG - (results["background"] if "background" in results else np.median(IMG)) zeropoint = options["ap_zeropoint"] if "ap_zeropoint" in options else 22.5 use_anchor = ( results["center"] if "center" in results else {"x": IMG.shape[1] / 2, "y": IMG.shape[0] / 2} ) if "ap_slice_anchor" in options: use_anchor = options["ap_slice_anchor"] else: logging.warning( "%s: ap_slice_anchor not specified by user, using: %s" % (options["ap_name"], str(use_anchor)) ) use_pa = results["init pa"] if "init pa" in results else 0.0 if "ap_slice_pa" in options: use_pa = options["ap_slice_pa"] * np.pi / 180 else: logging.warning( "%s: ap_slice_pa not specified by user, using: %.2f" % (options["ap_name"], use_pa) ) use_length = results["init R"] if "init R" in results else min(IMG.shape) if "ap_slice_length" in options: use_length = options["ap_slice_length"] else: logging.warning( "%s: ap_slice_length not specified by user, using: %.2f" % (options["ap_name"], use_length) ) use_width = 10.0 if "ap_slice_width" in options: use_width = options["ap_slice_width"] else: logging.warning( "%s: ap_slice_width not specified by user, using: %.2f" % (options["ap_name"], use_width) ) use_step = ( results["psf fwhm"] if "psf fwhm" in results else max(2.0, use_length / 100) ) if "ap_slice_step" in options: use_step = options["ap_slice_step"] else: logging.warning( "%s: ap_slice_step not specified by user, using: %.2f" % (options["ap_name"], use_step) ) F, X = _iso_line(dat, use_length, use_width, use_pa, use_anchor, more=False) windows = np.arange(0, use_length, use_step) R = (windows[1:] + windows[:-1]) / 2 sb = [] sb_e = [] sb_sclip = [] sb_sclip_e = [] for i in range(len(windows) - 1): isovals = F[np.logical_and(X >= windows[i], X < windows[i + 1])] isovals_sclip = Sigma_Clip_Upper(isovals, iterations=10, nsigma=5) medflux = _average( isovals, options["ap_isoaverage_method"] if "ap_isoaverage_method" in options else "median", ) scatflux = _scatter( isovals, options["ap_isoaverage_method"] if "ap_isoaverage_method" in options else "median", ) medflux_sclip = _average( isovals_sclip, options["ap_isoaverage_method"] if "ap_isoaverage_method" in options else "median", ) scatflux_sclip = _scatter( isovals_sclip, options["ap_isoaverage_method"] if "ap_isoaverage_method" in options else "median", ) sb.append( flux_to_sb(medflux, options["ap_pixscale"], zeropoint) if medflux > 0 else 99.999 ) sb_e.append( (2.5 * scatflux / (np.sqrt(len(isovals)) * medflux * np.log(10))) if medflux > 0 else 99.999 ) sb_sclip.append( flux_to_sb(medflux_sclip, options["ap_pixscale"], zeropoint) if medflux_sclip > 0 else 99.999 ) sb_sclip_e.append( ( 2.5 * scatflux_sclip / (np.sqrt(len(isovals)) * medflux_sclip * np.log(10)) ) if medflux_sclip > 0 else 99.999 ) with open( "%s%s_slice_profile.prof" % ( (options["ap_saveto"] if "ap_saveto" in options else ""), options["ap_name"], ), "w", ) as f: f.write( "# flux sum: %f\n" % (np.sum(F[np.logical_and(X >= 0, X <= use_length)])) ) f.write( "# flux mean: %f\n" % (_average(F[np.logical_and(X >= 0, X <= use_length)], "mean")) ) f.write( "# flux median: %f\n" % (_average(F[np.logical_and(X >= 0, X <= use_length)], "median")) ) f.write( "# flux mode: %f\n" % (_average(F[np.logical_and(X >= 0, X <= use_length)], "mode")) ) f.write( "# flux std: %f\n" % (np.std(F[np.logical_and(X >= 0, X <= use_length)])) ) f.write( "# flux 16-84%% range: %f\n" % (iqr(F[np.logical_and(X >= 0, X <= use_length)], rng=[16, 84])) ) f.write("R,sb,sb_e,sb_sclip,sb_sclip_e\n") 
f.write("arcsec,mag*arcsec^-2,mag*arcsec^-2,mag*arcsec^-2,mag*arcsec^-2\n") for i in range(len(R)): f.write( "%.4f,%.4f,%.4f,%.4f,%.4f\n" % ( R[i] * options["ap_pixscale"], sb[i], sb_e[i], sb_sclip[i], sb_sclip_e[i], ) ) if "ap_doplot" in options and options["ap_doplot"]: CHOOSE = np.array(sb_e) < 0.5 plt.errorbar( np.array(R)[CHOOSE] * options["ap_pixscale"], np.array(sb)[CHOOSE], yerr=np.array(sb_e)[CHOOSE], elinewidth=1, linewidth=0, marker=".", markersize=3, color="r", ) plt.xlabel("Position on line [arcsec]", fontsize=16) plt.ylabel("Surface Brightness [mag arcsec$^{-2}$]", fontsize=16) if "background noise" in results: bkgrdnoise = ( -2.5 * np.log10(results["background noise"]) + zeropoint + 2.5 * np.log10(options["ap_pixscale"] ** 2) ) plt.axhline( bkgrdnoise, color="purple", linewidth=0.5, linestyle="--", label="1$\\sigma$ noise/pixel: %.1f mag arcsec$^{-2}$" % bkgrdnoise, ) plt.gca().invert_yaxis() plt.legend(fontsize=15) plt.tick_params(labelsize=14) plt.tight_layout() if not ("ap_nologo" in options and options["ap_nologo"]): AddLogo(plt.gcf()) plt.savefig( "%sslice_profile_%s.jpg" % ( options["ap_plotpath"] if "ap_plotpath" in options else "", options["ap_name"], ), dpi=options["ap_plotdpi"] if "ap_plotdpi" in options else 300, ) plt.close() ranges = [ [ max( 0, int( use_anchor["x"] + 0.5 * use_length * np.cos(use_pa) - use_length * 0.7 ), ), min( IMG.shape[1], int( use_anchor["x"] + 0.5 * use_length * np.cos(use_pa) + use_length * 0.7 ), ), ], [ max( 0, int( use_anchor["y"] + 0.5 * use_length * np.sin(use_pa) - use_length * 0.7 ), ), min( IMG.shape[0], int( use_anchor["y"] + 0.5 * use_length * np.sin(use_pa) + use_length * 0.7 ), ), ], ] LSBImage( dat[ranges[1][0] : ranges[1][1], ranges[0][0] : ranges[0][1]], results["background noise"] if "background noise" in results else iqr(dat, rng=(31.731 / 2, 100 - 31.731 / 2)) / 2, ) XX, YY = np.meshgrid( np.arange(ranges[0][1] - ranges[0][0], dtype=float), np.arange(ranges[1][1] - ranges[1][0], dtype=float), ) XX -= use_anchor["x"] - float(ranges[0][0]) YY -= use_anchor["y"] - float(ranges[1][0]) XX, YY = ( XX * np.cos(-use_pa) - YY * np.sin(-use_pa), XX * np.sin(-use_pa) + YY * np.cos(-use_pa), ) ZZ = np.ones(XX.shape) ZZ[ np.logical_not( np.logical_and( np.logical_and(YY <= use_width / 2, YY >= -use_width / 2), np.logical_and(XX >= 0, XX <= use_length), ) ) ] = np.nan plt.imshow(ZZ, origin="lower", cmap="Reds_r", alpha=0.6) plt.tight_layout() if not ("ap_nologo" in options and options["ap_nologo"]): AddLogo(plt.gcf()) plt.savefig( "%sslice_profile_window_%s.jpg" % ( options["ap_plotpath"] if "ap_plotpath" in options else "", options["ap_name"], ), dpi=options["ap_plotdpi"] if "ap_plotdpi" in options else 300, ) plt.close() return IMG, {}
def _demo_validate_real_data(): ds_rate = 2 channel_map = [1] * 16 + [0, 0, 1, 1, 0, 1, 1, 1, 0] data_train_folder = load_experimental_data() mode = 'calibration' raw_dat, stamp_time, channels, type_amp, fs = read_data_csv( data_train_folder + '/rawdata.csv') dat = sig_pro(raw_dat, fs=fs, k=ds_rate) # Get data and labels s_i, t_t_i, t_i = trigger_decoder(mode=mode, trigger_loc=data_train_folder + '/triggers.txt') x_train, y_train, num_seq, _ = trial_reshaper(t_t_i, t_i, dat, mode=mode, fs=fs, k=ds_rate, channel_map=channel_map) model = train_pca_rda_kde_model(x_train, y_train, k_folds=10) fig = plt.figure() ax = fig.add_subplot(211) x_plot = np.linspace(np.min(model.line_el[-1]), np.max(model.line_el[-1]), 1000)[:, np.newaxis] ax.plot(model.line_el[2][y_train == 0], -0.005 - 0.01 * np.random.random(model.line_el[2][y_train == 0].shape[0]), 'ro', label='class(-)') ax.plot(model.line_el[2][y_train == 1], -0.005 - 0.01 * np.random.random(model.line_el[2][y_train == 1].shape[0]), 'go', label='class(+)') for idx in range(len(model.pipeline[2].list_den_est)): log_dens = model.pipeline[2].list_den_est[idx].score_samples(x_plot) ax.plot(x_plot[:, 0], np.exp(log_dens), 'r-' * (idx == 0) + 'g-' * (idx == 1), linewidth=2.0) ax.legend(loc='upper right') plt.title('Training Data') plt.ylabel('p(e|l)') plt.xlabel('scores') # Test data_test_folder = load_experimental_data() mode = 'calibration' raw_dat, stamp_time, channels, type_amp, fs = read_data_csv( data_test_folder + '/rawdata.csv') dat = sig_pro(raw_dat, fs=fs, k=ds_rate) # Get data and labels s_i, t_t_i, t_i = trigger_decoder(mode=mode, trigger_loc=data_test_folder + '/triggers.txt') x_test, y_test, num_seq, _ = trial_reshaper(t_t_i, t_i, dat, mode=mode, fs=fs, k=ds_rate, channel_map=channel_map) model.transform(x_test) ax.plot(model.line_el[2][y_test == 0], -0.01 - 0.01 * np.random.random(model.line_el[2][y_test == 0].shape[0]), 'bo', label='t_class(-)') ax.plot(model.line_el[2][y_test == 1], -0.01 - 0.01 * np.random.random(model.line_el[2][y_test == 1].shape[0]), 'ko', label='t_class(+)') bandwidth = 1.06 * min(np.std(model.line_el[2]), iqr(model.line_el[2]) / 1.34) * np.power( model.line_el[2].shape[0], -0.2) test_kde = KernelDensityEstimate(bandwidth=bandwidth) test_kde.fit(model.line_el[2], y_test) for idx in range(len(model.pipeline[2].list_den_est)): log_dens = test_kde.list_den_est[idx].score_samples(x_plot) ax.plot(x_plot[:, 0], np.exp(log_dens), 'b--' * (idx == 0) + 'k--' * (idx == 1), linewidth=2.0) ax.legend(loc='upper right') plt.title('Training Data') plt.ylabel('p(e|l)') plt.xlabel('scores') plt.show()
fSports = [] fMusic = [] fGames = [] fBSD = [] fRSD = [] fTSD = [] fThSD = [] fRN = [] fBN = [] fTN = [] fThN = [] for i in range(0, len(RA)): if BA[i] < np.percentile(BA, 75) + 1.5 * iqr(BA) and BA[i] > np.percentile( BA, 25) - 1.5 * iqr(BA) and NSD(0)[i] < np.percentile( NSD(0), 75) + 1.5 * iqr(NSD(0)) and NSD(0)[i] > np.percentile( NSD(0), 25) - 1.5 * iqr(NSD(0)) and RA[i] < np.percentile( RA, 75) + 1.5 * iqr(RA) and RA[i] > np.percentile( RA, 25 ) - 1.5 * iqr(RA) and NSD(1)[i] < np.percentile( NSD(1), 75 ) + 1.5 * iqr(NSD(1)) and NSD(1)[i] > np.percentile( NSD(1), 25 ) - 1.5 * iqr(NSD(1)) and TA[i] < np.percentile( TA, 75) + 1.5 * iqr(TA) and TA[i] > np.percentile( TA, 25 ) - 1.5 * iqr(TA) and NSD(2)[i] < np.percentile( NSD(2), 75) + 1.5 * iqr(NSD(2)) and NSD( 2
import numpy as np
from scipy.stats import iqr
from sklearn.preprocessing import RobustScaler  # used below but missing from the original snippet

# 1.1 'd' has an outlier
d = np.array([2, 4, 6, 8, 10, 12, 14, 16, 40])
d = d.reshape(-1, 1)
d

# 1.2 Transform using RobustScaler
rs = RobustScaler()
print("\n1.0 RobustScaler result:\n\n", rs.fit_transform(d))
print()

# 1.3 Calculate manually:
MEDIAN = np.median(d)
IQR = iqr(d)
print("2.0 Manual calculations result\n\n", (d - MEDIAN) / IQR)

# Result same as by RobustScaler
print("Both the above results are same.")
print("===============================")

############
print("\nNext, remove outlier and see results")

# 2.0 Remove outlier and see results
d1 = np.array([2, 4, 6, 8, 10, 12, 14, 16])
d1 = d1.reshape(-1, 1)
d1

# 2.1 Transform using RobustScaler
rs = RobustScaler()
print("\nRobustScaler result with outlier removed:\n", rs.fit_transform(d1))
if v.mean().values[0] > best:
    best = v.mean().values[0]
    abest = (k, v['Harmonic mean'])

for k, v in __.groupby('Algorithms'):
    # print(abest[0])
    if k != abest[0]:
        from scipy import stats
        s, p = stats.ttest_ind(abest[1], v['Harmonic mean'].values)
        if (p <= 0.05):
            print("*T-Test:", abest[0], k, s, p)
        else:
            print("T-Test:", abest[0], k, s, p)

# NOTE: scipy.stats.ranksums (Wilcoxon rank-sum test) is imported under the
# alias `kruskal` here, so the calls below actually run rank-sum tests.
from scipy.stats import ranksums as kruskal
from scipy.stats import iqr

plot(dfh, 'Harmonic mean', xcl)

for _, __ in dfh.groupby(['DataSet']):
    print(_)
    for k1, v1 in __.groupby('Algorithms'):
        print(k1, 'mdn:%0.2f' % v1['Harmonic mean'].median(),
              'iqr:%0.2f' % iqr(v1['Harmonic mean']))
        for k, v in __.groupby('Algorithms'):
            s, p = kruskal(v1['Harmonic mean'], v['Harmonic mean'])
            print('mdn:%0.2f' % v['Harmonic mean'].median(),
                  'iqr:%0.2f' % iqr(v['Harmonic mean']),
                  'p:%0.5f' % p, 's:%0.5f' % s, _, k, k1)

for _, __ in dfh.groupby(['DataSet']):
    print(_)
    for k1, v1 in __.groupby('Algorithms'):
        print(k1, 'mdn:%0.2f' % v1['Harmonic mean'].median(),
              'iqr:%0.2f' % iqr(v1['Harmonic mean']))
def __getitem__(self, idx): idx = idx % self.img_fn_len #img_fn = self.img_fns[idx] #img = Image.open(img_fn) bad_img_counts = 0 while True: try: img_fn = self.img_fns[idx] img = Image.open(img_fn) img_np = np.asarray(img) if img_np is None or np.prod(list(img_np.shape)) == 0: idx += 1 idx = idx % self.img_fn_len bad_img_counts += 1 print("Bad data1: {}\n".format(img_fn)) continue if bad_img_counts > 100: break break except: idx += 1 bad_img_counts += 1 idx = idx % self.img_fn_len print("Bad data2: {}\n".format(img_fn)) if bad_img_counts > 100: break img = img.resize((1024,512)) width, height = img.size gray = np.asarray(img.convert('L')) img = transforms.functional.to_tensor(img) bg_image = img.detach().clone() p_neg = np.random.rand() > 0.05 og_overlay_image = None if p_neg: if self.debug_train: overlay_fn = self.overlay_fns[self.rn] else: overlay_fn = self.overlay_fns[randint(0, self.n_overlays-1)] overlay_img = Image.open(overlay_fn).convert('L') og_overlay_image = transforms.functional.to_tensor(overlay_img) spatial_augs = transforms.Compose([ transforms.RandomHorizontalFlip(p=0.5), transforms.RandomAffine(degrees=60, translate=(.3, .1), scale=(.25, .5), fillcolor=(255)), transforms.Resize((height, width)), ] if not self.debug_train else [transforms.Resize((height, width))]) overlay_img = spatial_augs(overlay_img) overlay_img = 1.0 - transforms.functional.to_tensor(overlay_img) # create mask according to iqr and add color jitter iqr_val = iqr(overlay_img[overlay_img > 0]) mask = (overlay_img >= iqr_val).squeeze().float() overlay_img = transforms.functional.to_pil_image(overlay_img) # cj = transforms.ColorJitter(brightness=(0.8, 1.2), saturation=(0.8, 1.0)) # overlay_img = cj(overlay_img) mean_gray = norm(gray) spatial_augs2 = transforms.Compose([ transforms.Grayscale(), transforms.ToTensor() ]) ### DEBUG overlay_img = spatial_augs2(overlay_img).squeeze().numpy() mult = np.power(mean_gray, 0.2) #np.ones_like(mean_gray) final_img2 = transforms.functional.to_tensor(np.multiply(mult, overlay_img)).float() ### DEBUG img[:, mask==1] = final_img2[0, mask==1] else: mask = torch.zeros_like(img[0, :, :]) img_spatial_augs = transforms.Compose([ transforms.ToPILImage(), transforms.RandomHorizontalFlip(p=0.5), transforms.RandomAffine(degrees=0, translate=(0.01, 0.01)), transforms.CenterCrop((height-40, width-60)), transforms.Resize((height, width)), ] if not self.debug_train else [transforms.CenterCrop((height-40, width-60)), transforms.Resize((height, width))]) # use same random seed for both augmentations seed = np.random.randint(2147483647) random.seed(seed) torch.manual_seed(seed=seed) img = img_spatial_augs(img) random.seed(seed) torch.manual_seed(seed=seed) mask = img_spatial_augs(mask) img_color_augs = transforms.Compose([ transforms.ColorJitter(brightness=(0.7, 1.3), saturation=(0.7, 1.2), contrast=(0.8, 1.2)), transforms.ToTensor() ] if not self.debug_train else [transforms.ToTensor()]) img = img_color_augs(img) mask = transforms.functional.to_tensor(mask) #if img is None or mask is None or og_overlay_image is None or idx is None: #print("image is None or mask is none") output_dict = { "idx": idx, "input_img": img, "target_mask": mask, "bg": bg_image, "fg": og_overlay_image } return output_dict
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt  # used below but missing from the original snippet

#%% data import
with open('population_countries.csv', 'r', encoding="utf-8", errors="ignore") as fin:
    datarray = listmaker(fin)

#%% Calculating Descriptive Stats
pop_arith_mean = np.mean(datarray)
pop_geo_mean = stats.gmean(datarray)
pop_median = np.median(datarray)
pop_mode = stats.mode(datarray)
pop_std = np.std(datarray)
pop_iqr = stats.iqr(datarray)
pop_skew = stats.skew(datarray)
pop_num_miss = sum(np.isnan(datarray))

#%% Plotting Other Stats
binlist = np.logspace(np.log(1000), np.log10(20000000000), 20)
plt.xkcd()
_ = plt.hist(datarray, bins=binlist)
_ = plt.xscale("log")
_ = plt.xlabel("Number of Inhabitants")
plt.show()

_ = plt.boxplot(datarray)
_ = plt.yscale("log")
x = slider1.slide()
fft_x = abs(np.fft.rfft(x))
n = fft_x.size
#sample_rate = nbrOfSample/recordingTime
fft_x_freq = np.fft.rfftfreq(x.size, d=1. / sample_rate)

# Calculate values for csv-file
meanx = st.mean(fft_x)
mad1x = pd.Series(fft_x)
madx = mad1x.mad()
maxx = max(fft_x)
#print("max", maxx)
minx = min(fft_x)
stdx = st.stdev(fft_x)
iqx = iqr(fft_x)

# Calculate signal entropy
sx = pd.Series(fft_x)
vectorx = (sx.groupby(sx).transform('count') / len(sx)).values
entrox = entropy(vectorx)

# Calculate signal energy
resx = sum(map(lambda i: i * i, fft_x))
energyx = resx / (fft_x.size)

# Calculate SMA
integralx = np.trapz(fft_x, dx=timediff)
t = timediff * (fft_x.size)
SMA = (1 / t) * (integralx)

# skewness & kurtosis
sk = skew(fft_x)
def create_feature_array(self, current_epoch): xData = current_epoch['X'].to_numpy( ) #self.plot_signal(xData, 'xData') yData = current_epoch['Y'].to_numpy() zData = current_epoch['Z'].to_numpy() # Filter the data fc = 5 # Cut-off frequency of the filter w = fc / (20 / 2) # Normalize the frequency b, a = signal.butter(5, w, 'low') xDataFilt = signal.filtfilt(b, a, xData) yDataFilt = signal.filtfilt(b, a, yData) zDataFilt = signal.filtfilt(b, a, zData) # Calculate vector magnitude vmData = np.sqrt(xDataFilt**2 + yDataFilt**2 + zDataFilt**2) feature_array = [] # Average value in signal buffer for all acceleration components feature_array = np.append(feature_array, np.mean(xDataFilt)) feature_array = np.append(feature_array, np.mean(yDataFilt)) feature_array = np.append(feature_array, np.mean(zDataFilt)) feature_array = np.append(feature_array, np.mean(vmData)) # Standard deviation feature_array = np.append(feature_array, np.std(xDataFilt)) feature_array = np.append(feature_array, np.std(yDataFilt)) feature_array = np.append(feature_array, np.std(zDataFilt)) feature_array = np.append(feature_array, np.std(vmData)) # Median absolute deviation feature_array = np.append(feature_array, stats.median_absolute_deviation(xDataFilt)) feature_array = np.append(feature_array, stats.median_absolute_deviation(yDataFilt)) feature_array = np.append(feature_array, stats.median_absolute_deviation(zDataFilt)) feature_array = np.append(feature_array, stats.median_absolute_deviation(vmData)) # Maximum sample feature_array = np.append(feature_array, np.max(xDataFilt)) feature_array = np.append(feature_array, np.max(yDataFilt)) feature_array = np.append(feature_array, np.max(zDataFilt)) feature_array = np.append(feature_array, np.max(vmData)) # Minimum sample feature_array = np.append(feature_array, np.min(xDataFilt)) feature_array = np.append(feature_array, np.min(yDataFilt)) feature_array = np.append(feature_array, np.min(zDataFilt)) feature_array = np.append(feature_array, np.min(vmData)) # Signal magnitude area feature_array = np.append(feature_array, np.trapz(xDataFilt)) feature_array = np.append(feature_array, np.trapz(yDataFilt)) feature_array = np.append(feature_array, np.trapz(zDataFilt)) feature_array = np.append(feature_array, np.trapz(vmData)) # Energy measure energy = np.sum(xDataFilt**2) / len(xDataFilt) feature_array = np.append(feature_array, energy) energy = np.sum(yDataFilt**2) / len(yDataFilt) feature_array = np.append(feature_array, energy) energy = np.sum(zDataFilt**2) / len(zDataFilt) feature_array = np.append(feature_array, energy) energy = np.sum(vmData**2) / len(vmData) feature_array = np.append(feature_array, energy) # Inter-quartile range feature_array = np.append(feature_array, stats.iqr(xDataFilt, axis=0)) feature_array = np.append(feature_array, stats.iqr(yDataFilt, axis=0)) feature_array = np.append(feature_array, stats.iqr(zDataFilt, axis=0)) feature_array = np.append(feature_array, stats.iqr(vmData, axis=0)) # Autocorrelation features for all three acceleration components (3 each): height of main peak; height and position of second peak - Not sure this is right? 
autocorrelation = np.correlate(xDataFilt, xDataFilt, mode='full') autocorrelation = autocorrelation[len(xDataFilt) - 1:][0] feature_array = np.append(feature_array, autocorrelation) autocorrelation = np.correlate(yDataFilt, yDataFilt, mode='full') autocorrelation = autocorrelation[len(yDataFilt) - 1:][0] feature_array = np.append(feature_array, autocorrelation) autocorrelation = np.correlate(zDataFilt, zDataFilt, mode='full') autocorrelation = autocorrelation[len(zDataFilt) - 1:][0] feature_array = np.append(feature_array, autocorrelation) autocorrelation = np.correlate(vmData, vmData, mode='full') autocorrelation = autocorrelation[len(vmData) - 1:][0] feature_array = np.append(feature_array, autocorrelation) # Spectral peak features (12 each): height and position of first 6 peaks f, p = signal.periodogram(xDataFilt, 20e0) sort_index = np.argsort(p) p_sorted = p[sort_index] f_sorted = f[sort_index] speak_feats = p_sorted[-6:] speak_feats2 = f_sorted[-6:] feature_array = np.append(feature_array, speak_feats) feature_array = np.append(feature_array, speak_feats2) f, p = signal.periodogram(yDataFilt, 20e0) sort_index = np.argsort(p) p_sorted = p[sort_index] f_sorted = f[sort_index] speak_feats = p_sorted[-6:] speak_feats2 = f_sorted[-6:] feature_array = np.append(feature_array, speak_feats) feature_array = np.append(feature_array, speak_feats2) f, p = signal.periodogram(zDataFilt, 20e0) sort_index = np.argsort(p) p_sorted = p[sort_index] f_sorted = f[sort_index] speak_feats = p_sorted[-6:] speak_feats2 = f_sorted[-6:] feature_array = np.append(feature_array, speak_feats) feature_array = np.append(feature_array, speak_feats2) f, p = signal.periodogram(vmData, 20e0) sort_index = np.argsort(p) p_sorted = p[sort_index] f_sorted = f[sort_index] speak_feats = p_sorted[-6:] speak_feats2 = f_sorted[-6:] feature_array = np.append(feature_array, speak_feats) feature_array = np.append(feature_array, speak_feats2) # Spectral power features (4 each): total power in 4 adjacent and pre-defined frequency bands edges = [0.5, 1.5, 5, 7.5, 10] n_feats = len(edges) - 1 spower_feats = [] f, p = signal.periodogram(xDataFilt, 20e0) for i in range(n_feats): mask = (f >= edges[i]) & (f <= edges[i + 1]) sum(p[mask]) spower_feats = np.append(spower_feats, sum(p[mask])) feature_array = np.append(feature_array, spower_feats) spower_feats = [] f, p = signal.periodogram(yDataFilt, 20e0) for i in range(n_feats): mask = (f >= edges[i]) & (f <= edges[i + 1]) sum(p[mask]) spower_feats = np.append(spower_feats, sum(p[mask])) feature_array = np.append(feature_array, spower_feats) spower_feats = [] f, p = signal.periodogram(zDataFilt, 20e0) for i in range(n_feats): mask = (f >= edges[i]) & (f <= edges[i + 1]) sum(p[mask]) spower_feats = np.append(spower_feats, sum(p[mask])) feature_array = np.append(feature_array, spower_feats) spower_feats = [] f, p = signal.periodogram(vmData, 20e0) for i in range(n_feats): mask = (f >= edges[i]) & (f <= edges[i + 1]) sum(p[mask]) spower_feats = np.append(spower_feats, sum(p[mask])) feature_array = np.append(feature_array, spower_feats) return feature_array
plt.title("error between forwardly solved y-field from prediction and true input y-field for the 20th battery in test set with distance 1.59e"+str(key)) plt.savefig(str(dist)+"/test_20_yfield_diff.png") plt.figure() imshow_center(np.squeeze(X_test[0,:,:,1])-zf) plt.title("error between forwardly solved z-field from prediction and true input y-field for the 20th battery in test set with distance 1.59e"+str(key)) plt.savefig(str(dist)+"/test_20_zfield_diff.png") final_loss = custom_loss_rmse(test_labels_t2b, y_pred_ht2) print('final RMSE loss on test set:', final_loss.numpy()) NRMSE = final_loss/K.mean(test_labels_t2b) print('final normalized RMSE loss (div mean) on the test set:', NRMSE.numpy()) RMSE_range = final_loss /(tf.reduce_max(test_labels_t2b) - tf.reduce_min(test_labels_t2b)) print('final normalized RMSE loss (div range) on the test set:', RMSE_range.numpy()) test_arr = tf.keras.backend.flatten(test_labels_t2b).numpy() IQR = stats.iqr(test_arr) RMSE_IQR = final_loss/IQR print('final normalized RMSE loss (div IQR) on the test set:', RMSE_IQR.numpy()) print('final norm of the difference tensor:', tf.norm(y_pred_ht2-test_labels_t2b).numpy()) Boll_NRMSE = tf.norm(y_pred_ht2-test_labels_t2b) / tf.norm(test_labels_t2b) print('final Bollman normalized RMSE loss on the test set:', Boll_NRMSE.numpy()) specs_dict['final_RMSE'] = final_loss.numpy() specs_dict['NRMSE'] = NRMSE.numpy() specs_dict['RMSE_range'] = RMSE_range.numpy() specs_dict['RMSE_IQR'] = RMSE_IQR.numpy() specs_dict['Boll_NRMSE'] = Boll_NRMSE.numpy() result_dict[key] = specs_dict print(100*'|')
def get_linguistic_metadata(wsdir, master_anno, idno_file): #print(wsdir + master_anno + idno_file) root_document = etree.parse(wsdir + master_anno + idno_file).getroot() #print(len(root_document)) specific_namespaces = { 'tei': 'http://www.tei-c.org/ns/1.0', 'xi': 'http://www.w3.org/2001/XInclude', 'cligs': 'https://cligs.hypotheses.org/ns/cligs' } poss = [ "conjunction", "determiner", "noun", "verb", "adverb", "adjective", "adposition", "punctuation", "pronoun", "date", "number", "interjection" ] ling_measures = "\n" types_vaues = root_document.xpath("//tei:w//text()", namespaces=specific_namespaces) ling_measures = ling_measures + '\n\t\t\t\t<measure unit="types">' + str( len(set(types_vaues))) + r'</measure>' ling_measures += "\n" tags = ["s", "w"] for tag in tags: #print(tag) tag_elements = root_document.xpath("//tei:" + tag, namespaces=specific_namespaces) ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + tag + r's">' + str( len(tag_elements)) + r'</measure>' #print(len(tag_elements)) len_tag = [] amount_act_verbs_text = [] for tag_element in tag_elements: len_tag.append( len(" ".join( tag_element.xpath(".//text()", namespaces=specific_namespaces)))) if tag == "s": amount_active_verbs = len( tag_element.xpath("./tei:w[@cligs:ctag='VMI']", namespaces=specific_namespaces)) amount_active_verbs += len( tag_element.xpath("./tei:w[@cligs:ctag='VSI']", namespaces=specific_namespaces)) amount_act_verbs_text.append(amount_active_verbs) len_tag = np.array(len_tag) ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + tag + r's.num.mean">' + str( "%.2f" % round(len_tag.mean(), 2)) + r'</measure>' ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + tag + r's.num.std">' + str( "%.2f" % round(len_tag.std(), 2)) + r'</measure>' ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + tag + r's.num.median">' + str( "%.2f" % round(np.percentile(len_tag, q=50), 2)) + r'</measure>' ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + tag + r's.num.iqr">' + str( "%.2f" % round(stats.iqr(len_tag), 2)) + r'</measure>' if tag == "s": amount_act_verbs_text = np.array(amount_act_verbs_text) ling_measures = ling_measures + '\n\t\t\t\t<measure unit="ss.active.verbs.mean">' + str( "%.2f" % round(amount_act_verbs_text.mean(), 2)) + r'</measure>' ling_measures = ling_measures + '\n\t\t\t\t<measure unit="ss.active.verbs.std">' + str( "%.2f" % round(amount_act_verbs_text.std(), 2)) + r'</measure>' ling_measures = ling_measures + '\n\t\t\t\t<measure unit="ss.active.verbs.median">' + str( "%.2f" % round(np.percentile(amount_act_verbs_text, q=50), 2)) + r'</measure>' ling_measures = ling_measures + '\n\t\t\t\t<measure unit="ss.active.verbs.iqr">' + str( "%.2f" % round(stats.iqr(amount_act_verbs_text), 2)) + r'</measure>' ling_measures += "\n" ling_measures += "\n" for pos in poss: pos_value = str( len( root_document.xpath("//tei:w[@pos='" + pos + "']", namespaces=specific_namespaces))) ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + pos + 's">' + pos_value + r'</measure>' ling_measures += "\n" nes = ["person", "organization", "location", "other"] for ne in nes: ne_value = str( len( root_document.xpath("//tei:w[@cligs:neclass='" + ne + "']", namespaces=specific_namespaces))) ling_measures = ling_measures + '\n\t\t\t\t<measure unit="ne.' 
+ ne + 's">' + ne_value + r'</measure>' ling_measures += "\n" wnlexs = [ 'noun.plant', 'verb.communication', 'noun.food', 'verb.possession', 'verb.cognition', 'noun.communication', 'noun.state', 'verb.stative', 'noun.cognition', 'noun.time', 'verb.body', 'noun.person', 'adj.all', 'noun.quantity', 'noun.phenomenon', 'verb.creation', 'adj.pert', 'adv.all', 'noun.process', 'noun.artifact', 'verb.perception', 'noun.feeling', 'verb.weather', 'noun.substance', 'noun.shape', 'verb.competition', 'verb.motion', 'noun.animal', 'noun.act', 'noun.body', 'noun.object', 'noun.motive', 'verb.social', 'noun.group', 'verb.consumption', 'noun.possession', 'noun.Tops', 'noun.relation', 'noun.attribute', 'verb.emotion', 'noun.location', 'noun.event', 'verb.contact', 'xxx', 'verb.change' ] for wnlex in wnlexs: wnlex_value = str( len( root_document.xpath("//tei:w[@cligs:wnlex='" + wnlex + "']", namespaces=specific_namespaces))) ling_measures = ling_measures + '\n\t\t\t\t<measure unit="wnlex.' + wnlex + 's">' + wnlex_value + r'</measure>' return ling_measures
def epps_singleton_2samp(x, y, t=(0.4, 0.8)): """ Compute the Epps-Singleton (ES) test statistic. Test the null hypothesis that two samples have the same underlying probability distribution. Parameters ---------- x, y : array-like The two samples of observations to be tested. Input must not have more than one dimension. Samples can have different lengths. t : array-like, optional The points (t1, ..., tn) where the empirical characteristic function is to be evaluated. It should be positive distinct numbers. The default value (0.4, 0.8) is proposed in [1]_. Input must not have more than one dimension. Returns ------- statistic : float The test statistic. pvalue : float The associated p-value based on the asymptotic chi2-distribution. See Also -------- ks_2samp, anderson_ksamp Notes ----- Testing whether two samples are generated by the same underlying distribution is a classical question in statistics. A widely used test is the Kolmogorov-Smirnov (KS) test which relies on the empirical distribution function. Epps and Singleton introduce a test based on the empirical characteristic function in [1]_. One advantage of the ES test compared to the KS test is that is does not assume a continuous distribution. In [1]_, the authors conclude that the test also has a higher power than the KS test in many examples. They recommend the use of the ES test for discrete samples as well as continuous samples with at least 25 observations each, whereas `anderson_ksamp` is recommended for smaller sample sizes in the continuous case. The p-value is computed from the asymptotic distribution of the test statistic which follows a `chi2` distribution. If the sample size of both `x` and `y` is below 25, the small sample correction proposed in [1]_ is applied to the test statistic. The default values of `t` are determined in [1]_ by considering various distributions and finding good values that lead to a high power of the test in general. Table III in [1]_ gives the optimal values for the distributions tested in that study. The values of `t` are scaled by the semi-interquartile range in the implementation, see [1]_. References ---------- .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample problem using the empirical characteristic function", Journal of Statistical Computation and Simulation 26, p. 177--203, 1986. .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions - the Epps-Singleton two-sample test using the empirical characteristic function", The Stata Journal 9(3), p. 454--465, 2009. 
""" x, y, t = np.asarray(x), np.asarray(y), np.asarray(t) # check if x and y are valid inputs if x.ndim > 1: raise ValueError('x must be 1d, but x.ndim equals {}.'.format(x.ndim)) if y.ndim > 1: raise ValueError('y must be 1d, but y.ndim equals {}.'.format(y.ndim)) nx, ny = len(x), len(y) if (nx < 5) or (ny < 5): raise ValueError('x and y should have at least 5 elements, but len(x) ' '= {} and len(y) = {}.'.format(nx, ny)) if not np.isfinite(x).all(): raise ValueError('x must not contain nonfinite values.') if not np.isfinite(y).all(): raise ValueError('y must not contain nonfinite values.') n = nx + ny # check if t is valid if t.ndim > 1: raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim)) if np.less_equal(t, 0).any(): raise ValueError('t must contain positive elements only.') # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid # circular import from scipy.stats import iqr sigma = iqr(np.hstack((x, y))) / 2 ts = np.reshape(t, (-1, 1)) / sigma # covariance estimation of ES test gx = np.vstack( (np.cos(ts * x), np.sin(ts * x))).T # shape = (nx, 2*len(t)) gy = np.vstack((np.cos(ts * y), np.sin(ts * y))).T cov_x = np.cov(gx.T, bias=True) # the test uses biased cov-estimate cov_y = np.cov(gy.T, bias=True) est_cov = (n / nx) * cov_x + (n / ny) * cov_y est_cov_inv = np.linalg.pinv(est_cov) r = np.linalg.matrix_rank(est_cov_inv) if r < 2 * len(t): warnings.warn('Estimated covariance matrix does not have full rank. ' 'This indicates a bad choice of the input t and the ' 'test might not be consistent.') # see p. 183 in [1]_ # compute test statistic w distributed asympt. as chisquare with df=r g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0) w = n * np.dot(g_diff.T, np.dot(est_cov_inv, g_diff)) # apply small-sample correction if (max(nx, ny) < 25): corr = 1.0 / (1.0 + n**(-0.45) + 10.1 * (nx**(-1.7) + ny**(-1.7))) w = corr * w p = chi2.sf(w, r) return Epps_Singleton_2sampResult(w, p)
'behavioural_results.npz'), allow_pickle=True, encoding='bytes') as fi: snare_deviation_now = fi['snare_deviation'][snareInlier[idx]] wdBlk_deviation_now = fi['wdBlk_deviation'][wdBlkInlier[idx]] # take only the trials where performance is not nan snare_finite = np.isfinite(snare_deviation_now) wdBlk_finite = np.isfinite(wdBlk_deviation_now) snare_inlier_now = snare_finite #already filtered for snareInlier in line 41 and 96 wdBlk_inlier_now = wdBlk_finite # take only the trials in range median ± 1.5*IQR if iqr_rejection: lb_snare = np.median( snare_deviation_now[snare_finite]) - 1.5 * iqr( snare_deviation_now[snare_finite]) ub_snare = np.median( snare_deviation_now[snare_finite]) + 1.5 * iqr( snare_deviation_now[snare_finite]) idx_iqr_snare = np.logical_and(snare_deviation_now > lb_snare, snare_deviation_now < ub_snare) snare_inlier_now = np.logical_and(snare_finite, idx_iqr_snare) lb_wdBlk = np.median( wdBlk_deviation_now[wdBlk_finite]) - 1.5 * iqr( wdBlk_deviation_now[wdBlk_finite]) ub_wdBlk = np.median( wdBlk_deviation_now[wdBlk_finite]) + 1.5 * iqr( wdBlk_deviation_now[wdBlk_finite]) idx_iqr_wdBlk = np.logical_and(wdBlk_deviation_now > lb_wdBlk, wdBlk_deviation_now < ub_wdBlk) wdBlk_inlier_now = np.logical_and(wdBlk_finite, idx_iqr_wdBlk)
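# The rejection rule above (keep finite trials within median +/- 1.5 * IQR) can be
# wrapped in a small helper; a sketch only, the function name is not from the original.
import numpy as np
from scipy.stats import iqr

def iqr_inlier_mask(x, whisker=1.5):
    finite = np.isfinite(x)
    med = np.median(x[finite])
    half_width = whisker * iqr(x[finite])
    return finite & (x > med - half_width) & (x < med + half_width)

# e.g. snare_inlier_now = iqr_inlier_mask(snare_deviation_now)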
def __init__(self, data, channels, sf, hypno, href, preload, use_mne, downsample, kwargs_mne, annotations): """Init.""" # ========================== LOAD DATA ========================== # Dialog window if data is None : if data is None: data = dialog_load(self, "Open dataset", '', "Any EEG files (*.vhdr *.edf *.gdf *.bdf *.eeg " "*.egi *.mff *.cnt *.trc *.set *.rec);;" "BrainVision (*.vhdr);;EDF (*.edf);;" "GDF (*.gdf);;BDF (*.bdf);;Elan (*.eeg);;" "EGI (*.egi);;MFF (*.mff);;CNT (*.cnt);;" "Micromed (*.trc);;EEGLab (*.set);;REC (*.rec)") upath = os.path.split(data)[0] else: upath = '' if isinstance(data, str): # file is defined # ---------- USE SLEEP or MNE ---------- # Find file extension : file, ext = get_file_ext(data) # Force to use MNE if preload is False : use_mne = True if not preload else use_mne # Get if the file has to be loaded using Sleep or MNE python : sleep_ext = ['.eeg', '.vhdr', '.edf', '.trc', '.rec'] use_mne = True if ext not in sleep_ext else use_mne if use_mne: is_mne_installed(raise_error=True) # ---------- LOAD THE FILE ---------- if use_mne: # Load using MNE functions logger.debug("Load file using MNE-python") kwargs_mne['preload'] = preload args = mne_switch(file, ext, downsample, **kwargs_mne) else: # Load using Sleep functions logger.debug("Load file using Sleep") args = sleep_switch(file, ext, downsample) # Get output arguments : (sf, downsample, dsf, data, channels, n, offset, annot) = args info = ("Data successfully loaded (%s):" "\n- Sampling-frequency : %.2fHz" "\n- Number of time points (before down-sampling): %i" "\n- Down-sampling frequency : %.2fHz" "\n- Number of time points (after down-sampling): %i" "\n- Number of channels : %i" ) n_channels, n_pts_after = data.shape logger.info(info % (file + ext, sf, n, downsample, n_pts_after, n_channels)) PROFILER("Data file loaded", level=1) elif isinstance(data, np.ndarray): # array of data is defined if not isinstance(sf, (int, float)): raise ValueError("When passing raw data, the sampling " "frequency parameter, sf, must either be an " "integer or a float.") file = annot = None offset = datetime.time(0, 0, 0) dsf, downsample = get_dsf(downsample, sf) n = data.shape[1] data = data[:, ::dsf] else: raise IOError("The data should either be a string which refer to " "the path of a file or an array of raw data of shape" " (n_electrodes, n_time_points).") # Keep variables : self._file = file self._annot_file = np.c_[merge_annotations(annotations, annot)] self._N = n self._dsf = dsf self._sfori = float(sf) self._toffset = offset.hour * 3600. + offset.minute * 60. 
+ \ offset.second time = np.arange(n)[::dsf] / sf self._sf = float(downsample) if downsample is not None else float(sf) # ========================== LOAD HYPNOGRAM ========================== # Dialog window for hypnogram : if hypno is None: hypno = dialog_load(self, "Open hypnogram", upath, "Text file (*.txt);;Elan (*.hyp);;" "CSV file (*.csv);;EDF+ file(*.edf);" ";All files (*.*)") hypno = None if hypno == '' else hypno if isinstance(hypno, np.ndarray): # array_like if len(hypno) == n: hypno = hypno[::dsf] else: raise ValueError("Then length of the hypnogram must be the " "same as raw data") if isinstance(hypno, str): # (*.hyp / *.txt / *.csv) hypno, _ = read_hypno(hypno, time=time, datafile=file) # Oversample then downsample : hypno = oversample_hypno(hypno, self._N)[::dsf] PROFILER("Hypnogram file loaded", level=1) # ========================== CHECKING ========================== # ---------- DATA ---------- # Check data shape : if data.ndim is not 2: raise ValueError("The data must be a 2D array") nchan, npts = data.shape # ---------- CHANNELS ---------- if (channels is None) or (len(channels) != nchan): warn("The number of channels must be " + str(nchan) + ". Default " "channel names will be used instead.") channels = ['chan' + str(k) for k in range(nchan)] # Clean channel names : patterns = ['eeg', 'EEG', 'ref'] chanc = [] for c in channels: # Remove informations after . : c = c.split('.')[0] c = c.split('-')[0] # Exclude patterns : for i in patterns: c = c.replace(i, '') # Remove space : c = c.replace(' ', '') c = c.strip() chanc.append(c) # ---------- STAGE ORDER ---------- # href checking : absref = ['art', 'wake', 'n1', 'n2', 'n3', 'rem'] absint = [-1, 0, 1, 2, 3, 4] if href is None: href = absref elif (href is not None) and isinstance(href, list): # Force lower case : href = [k.lower() for k in href] # Check that all stage are present : for k in absref: if k not in href: raise ValueError(k + " not found in href.") # Force capitalize : href = [k.capitalize() for k in href] href[href.index('Rem')] = 'REM' else: raise ValueError("The href parameter must be a list of string and" " must contain 'art', 'wake', 'n1', 'n2', 'n3' " "and 'rem'") # Conversion variable : absref = ['Art', 'Wake', 'N1', 'N2', 'N3', 'REM'] conv = {absint[absref.index(k)]: absint[i] for i, k in enumerate(href)} # ---------- HYPNOGRAM ---------- if hypno is None: hypno = np.zeros((npts,), dtype=np.float32) else: n = len(hypno) # Check hypno values : if (hypno.min() < -1.) or (hypno.max() > 4) or (n != npts): warn("\nHypnogram values must be comprised between -1 and 4 " "(see Iber et al. 2007). Use:\n-1 -> Art (optional)\n 0 " "-> Wake\n 1 -> N1\n 2 -> N2\n 3 -> N4\n 4 -> REM\nEmpty " "hypnogram will be used instead") hypno = np.zeros((npts,), dtype=np.float32) # ---------- SCALING ---------- # Assume that the inter-quartile amplitude of EEG data is ~50 uV iqr_chan = iqr(data[:, :int(data.shape[1] / 4)], axis=-1) bad_iqr = iqr_chan < 1. if np.any(bad_iqr): mult_fact = np.zeros_like(iqr_chan) iqr_chan[iqr_chan == 0.] = 1. mult_fact[bad_iqr] = np.floor(np.log10(50. / iqr_chan[bad_iqr])) data *= 10. ** mult_fact[..., np.newaxis] warn("Wrong channel data amplitude. ") # ---------- CONVERSION ----------= # Convert data and hypno to be contiguous and float 32 (for vispy): self._data = vispy_array(data) self._hypno = vispy_array(hypno) self._time = vispy_array(time) self._channels = chanc self._href = href self._hconv = conv PROFILER("Check data", level=1)
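# The amplitude check above in isolation: estimate each channel's IQR on the first
# quarter of the recording and, where it looks too small to be EEG in microvolts,
# rescale by a power of ten towards an IQR of roughly 50 uV. A sketch with
# illustrative names, not part of the original class.
import numpy as np
from scipy.stats import iqr

def rescale_to_microvolts(data, target_iqr=50.0, min_iqr=1.0):
    """data: array of shape (n_channels, n_samples)."""
    iqr_chan = iqr(data[:, :data.shape[1] // 4], axis=-1)
    too_small = iqr_chan < min_iqr
    if np.any(too_small):
        iqr_chan[iqr_chan == 0.0] = 1.0
        factors = np.zeros_like(iqr_chan)
        factors[too_small] = np.floor(np.log10(target_iqr / iqr_chan[too_small]))
        data = data * 10.0 ** factors[..., np.newaxis]
    return data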
def evaluate_model_helper(df_input, choice, params, seed, scoring_metrics, eval_model, num_folds=10, eval_method="robust", verbose=0, cv_generator=None): """ Evaluate the model performance with the given parameters, cv and seed With the given metric list :param df_input: Pandas DataFrame, the original input dataset :param choice: String, one of "Metal", "Insulator", "MIT" :param params: Dictionary, the best parameters from hyperparameter tuning :param seed: Integer, the random seed for reproducibility :param scoring_metrics: List, a list of scoring metrics :param eval_model: sklearn model, the model to evaluate :param num_folds: Integer, the number of stratified folds (default: 10) :param eval_method: String, one of "robust", "standard" :param verbose: Int, if 1, print out the intermediate results :param cv_generator: Cross validator, if None will use stratified k-fold :return: Dictionary """ X_features, y_labels = load_data(df_input, choice) if eval_model.__name__ == "LogisticRegression": X_features = RobustScaler().fit_transform(X_features) fit_params_dict = None # if (multiclass & XGBClassifier) | GradientBoostingClassifier, specify the sample weights if ((choice == "Multiclass") and (eval_model.__name__ == "XGBClassifier")) or \ (eval_model.__name__ == "GradientBoostingClassifier"): fit_params_dict = { "sample_weight": compute_sample_weight(class_weight="balanced", y=y_labels) } # if cv_folds is not specified if not cv_generator: # initialize the stratified k-folds cv_generator = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed) # initialize the xgboost classifier with the tuned parameters model_to_eval = eval_model(**params[choice]) # evaluate the tuned model with stratified k-fold cv cv_scores = cross_validate(model_to_eval, X_features, y_labels, scoring=scoring_metrics, cv=cv_generator, error_score=np.nan, fit_params=fit_params_dict) if verbose == 1: print( "\nEvaluating the {label} vs. non-{label} binary classifier (seed={rand_seed})" .format(label=choice, rand_seed=seed)) if num_folds: print("For {} folds".format(num_folds)) if eval_method == "robust": printout_lst = [ "Median {}: {:0.2f} w/ IQR: {:0.2f}".format( metric, np.nanmedian(cv_scores["test_" + metric]), iqr(cv_scores["test_" + metric], nan_policy="omit")) for metric in scoring_metrics ] elif eval_method == "standard": printout_lst = [ "Mean {}: {:0.2f} w/ std: {:0.2f}".format( metric, np.nanmean(cv_scores["test_" + metric]), np.nanstd(cv_scores["test_" + metric])) for metric in scoring_metrics ] print(*printout_lst, sep="\n") print("-----------------------------------\n") return {metric: cv_scores["test_" + metric] for metric in scoring_metrics}
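# A hypothetical call of the helper above. `df` and `best_params` are placeholders
# (they are not defined in the original snippet); the choice and metric names follow
# the docstring.
from sklearn.linear_model import LogisticRegression

scores = evaluate_model_helper(
    df_input=df, choice="Metal", params=best_params, seed=42,
    scoring_metrics=["f1", "roc_auc"], eval_model=LogisticRegression,
    num_folds=10, eval_method="robust", verbose=1)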
### Statistical significance of the results
nruns = 1000
sep_limit = 0.02

p_median, p_mean = FragMent.Stat_Sig(nn_seps, "NNS", boundary, nruns, sep_limit)
ks_stat, p_ks = FragMent.KS_test(nn_seps, "NNS", boundary, nruns, sep_limit)
ad_stat, crit_vals, p_ad = FragMent.AD_test(nn_seps, "NNS", boundary, nruns, sep_limit)

### Report results
print(" ")
print(" ")
print("######## Nearest neighbour results ########")
print("The median and interquartile range of the distribution are: ",
      numpy.median(nn_seps), iqr(nn_seps))
print("The p-value using the median-interquartile range NHT is   : ", p_median)
print("The mean and standard deviation of the distribution are   : ",
      numpy.mean(nn_seps), numpy.std(nn_seps))
print("The p-value using the mean-standard deviation NHT is      : ", p_mean)
print("The p-values from the K-S and A-D test are                : ", p_ks, p_ad)

### Perform a minimum spanning tree test on the data
mst_seps, mst = FragMent.MST(pos)

### Statistical significance of the results
p_median, p_mean = FragMent.Stat_Sig(mst_seps, "MST", boundary, nruns, sep_limit)
ks_stat, p_ks = FragMent.KS_test(mst_seps, "MST", boundary, nruns, sep_limit)
ad_stat, crit_vals, p_ad = FragMent.AD_test(mst_seps, "MST", boundary, nruns, sep_limit)
def find_iqr(dframe):
    # IQR of each row, excluding the diagonal element of that row
    return [int(iqr([v for k, v in enumerate(dframe.iloc[i].values) if k != i]))
            for i in range(len(dframe))]
date_parser=lambda x: pd.to_datetime(float(x) + 28800000000000)) p = p.drop(columns=['name']) d = p['Press'].values l = p.index mean = np.mean(d) trimmean = stats.trim_mean(d, 0.2) median = np.median(d) meanv = np.array([np.mean(d)] * len(d)) trimmeanv = np.array([stats.trim_mean(d, 0.2)] * len(d)) medianv = np.array([np.median(d)] * len(d)) stdv = np.array([np.std(d)] * len(d)) iqrv = np.array([stats.iqr(d)] * len(d)) madv = np.array([stats.median_absolute_deviation(d)] * len(d)) print("std =", format(np.std(d), ".2f"), " iqr =", format(stats.iqr(d), ".2f"), " mad =", format(stats.median_absolute_deviation(d), ".2f")) mean_up = meanv + stdv mean_down = meanv - stdv median_up = medianv + iqrv median_down = medianv - iqrv mad_up = trimmeanv + madv mad_down = trimmeanv - madv #plt.figure(figsize=(10,7))
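# The three dispersion bands built above (mean +/- std, median +/- IQR, trimmed mean
# +/- MAD) for an arbitrary 1-D series; a sketch with illustrative names that uses the
# newer SciPy spelling median_abs_deviation in place of the deprecated
# median_absolute_deviation.
import numpy as np
from scipy import stats

def dispersion_bands(d, trim=0.2):
    centers_spreads = {
        'mean_std': (np.mean(d), np.std(d)),
        'median_iqr': (np.median(d), stats.iqr(d)),
        'trimmean_mad': (stats.trim_mean(d, trim), stats.median_abs_deviation(d)),
    }
    return {name: (c - s, c + s) for name, (c, s) in centers_spreads.items()}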
def performPrep(eeg, refChan, srate, linenoise, referenceType='robust'): dim = np.shape(eeg) if refChan != 0: eeg_chans = np.setdiff1d( range(0, dim[0]), refChan - 1) #remove the reference channel from the eeg channels eeg = eeg[eeg_chans, :] #finding bad channels #finding channels with NaNs or constant values for long periods of time org_dim = np.shape(eeg) originalChannels = np.arange(org_dim[0]) channelsInterpolate = originalChannels nanChannelMask = [False] * org_dim[0] noSignalChannelMask = [False] * org_dim[0] for i in range(0, org_dim[0]): nanChannelMask[i] = np.sum(np.isnan(eeg[i, :])) > 0 for i in range(0, org_dim[0]): noSignalChannelMask[i] = robust.mad(eeg[i, :]) < 10**(-10) or np.std( eeg[i, :]) < 10**(-10) badChannelsfromNans = channelsInterpolate[nanChannelMask] badChannelsfromNoData = channelsInterpolate[noSignalChannelMask] for i in range(0, org_dim[0]): if nanChannelMask[i] == True or noSignalChannelMask[i] == True: eeg = np.delete(eeg, i, axis=0) channelsInterpolate = np.setdiff1d( channelsInterpolate, np.union1d( badChannelsfromNans, badChannelsfromNoData)) #channels to be used for interpolation evaluationChannels = channelsInterpolate new_dim = np.shape(eeg) # find channels that have abnormally high or low amplitude robustchanneldeviation = np.zeros(org_dim[0]) badChannelFromDeviationMask = [False] * (new_dim[0]) channeldeviation = np.zeros(new_dim[0]) for i in range(0, new_dim[0]): channeldeviation[i] = 0.7413 * iqr(eeg[i, :]) channeldeviationSD = 0.7413 * iqr(channeldeviation) channeldeviationMedian = np.nanmedian(channeldeviation) robustchanneldeviation[evaluationChannels] = np.divide( np.subtract(channeldeviation, channeldeviationMedian), channeldeviationSD) for i in range(0, new_dim[0]): badChannelFromDeviationMask[i] = abs( robustchanneldeviation[i]) > 5 or np.isnan( robustchanneldeviation[i]) badChannelsfromDeviation = evaluationChannels[badChannelFromDeviationMask] #finding channels with high frequency noise if srate > 100: eeg = np.transpose(eeg) dim = np.shape(eeg) X = np.zeros((dim[0], dim[1])) B = filter_design(100, A=np.array([1, 1, 0, 0]), F=np.array([0, .36, 0.4, 1]), srate=250) for i in range(0, dim[1]): X[:, i] = signal.filtfilt(B, 1, eeg[:, i]) noisiness = np.divide(robust.mad(np.subtract(eeg, X)), robust.mad(X)) noisinessmedian = np.nanmedian(noisiness) noiseSD = np.median( np.absolute(np.subtract(noisiness, np.median(noisiness)))) * 1.4826 zscoreHFNoise = np.divide(np.subtract(noisiness, noisinessmedian), noiseSD) HFnoisemask = [False] * new_dim[0] for i in range(0, new_dim[0]): HFnoisemask[i] = zscoreHFNoise[i] > 5 or np.isnan(zscoreHFNoise[i]) else: X = eeg noisinessmedian = 0 noisinessSD = 1 zscoreHFNoise = np.zeros(dim[1], 1) badChannelsfromHFnoise = [] badChannelsfromHFnoise = evaluationChannels[HFnoisemask] #finding channels by correlation correlationSeconds = 1 # default value correlationFrames = correlationSeconds * srate correlationWindow = np.arange(correlationFrames) correlationOffsets = np.arange(1, dim[0] - correlationFrames, correlationFrames) Wcorrelation = len(correlationOffsets) maximumCorrelations = np.ones((org_dim[0], Wcorrelation)) drop_out = np.zeros((dim[1], Wcorrelation)) channelCorrelation = np.ones((Wcorrelation, dim[1])) noiselevels = np.zeros((Wcorrelation, dim[1])) channelDeviations = np.zeros((Wcorrelation, dim[1])) drop = np.zeros((Wcorrelation, dim[1])) n = len(correlationWindow) XWin = np.reshape(np.transpose(X[0:n * Wcorrelation, :]), (dim[1], n, Wcorrelation), order='F') dataWin = np.reshape(np.transpose(eeg[0:n * 
Wcorrelation, :]), (dim[1], n, Wcorrelation), order='F') for k in range(0, Wcorrelation): eegportion = np.transpose(np.squeeze(XWin[:, :, k])) dataportion = np.transpose(np.squeeze(dataWin[:, :, k])) windowCorrelation = np.corrcoef(np.transpose(eegportion)) abs_corr = np.abs( np.subtract(windowCorrelation, np.diag(np.diag(windowCorrelation)))) channelCorrelation[k, :] = np.quantile( abs_corr, 0.98, axis=0) # problem is here is solved noiselevels[k, :] = np.divide( robust.mad(np.subtract(dataportion, eegportion)), robust.mad(eegportion)) channelDeviations[k, :] = 0.7413 * iqr(dataportion, axis=0) for i in range(0, Wcorrelation): for j in range(0, dim[1]): drop[i, j] = np.int( np.isnan(channelCorrelation[i, j]) or np.isnan(noiselevels[i, j])) if drop[i, j] == 1: channelDeviations[i, j] = 0 noiselevels[i, j] = 0 maximumCorrelations[evaluationChannels, :] = np.transpose( channelCorrelation) drop_out[:] = np.transpose(drop) noiselevels_out = np.transpose(noiselevels) channelDeviations_out = np.transpose(channelDeviations) thresholdedCorrelations = maximumCorrelations < 0.4 thresholdedCorrelations = thresholdedCorrelations.astype(int) fractionBadCorrelationWindows = np.mean(thresholdedCorrelations, axis=1) fractionBadDropOutWindows = np.mean(drop_out, axis=1) badChannelsFromCorrelation = np.where(fractionBadCorrelationWindows > 0.01) badChannelsFromCorrelation_out = badChannelsFromCorrelation[:] badChannelsFromDropOuts = np.where(fractionBadDropOutWindows > 0.01) badChannelsFromDropOuts_out = badChannelsFromDropOuts[:] #medianMaxCorrelation = np.median(maximumCorrelations, 2); badChannelsfromSNR = np.union1d(badChannelsFromCorrelation_out, badChannelsfromHFnoise) noisyChannels = np.union1d( np.union1d( np.union1d( badChannelsfromDeviation, np.union1d(badChannelsFromCorrelation_out, badChannelsFromDropOuts_out)), badChannelsfromSNR), np.union1d(badChannelsfromNans, badChannelsfromNoData)) print(noisyChannels)
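# The "robust deviation" criterion used above, in isolation: 0.7413 * IQR serves as a
# robust estimate of the standard deviation, and channels are scored by a
# median-centred z-score. A sketch with illustrative names, not part of performPrep.
import numpy as np
from scipy.stats import iqr

def robust_deviation_zscores(eeg):
    """eeg: array of shape (n_channels, n_samples)."""
    chan_dev = 0.7413 * iqr(eeg, axis=1)      # per-channel robust spread
    dev_sd = 0.7413 * iqr(chan_dev)           # robust spread across channels
    return (chan_dev - np.nanmedian(chan_dev)) / dev_sd

# channels with |z| > 5 (or NaN z) would be flagged as bad-by-deviation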
""" for col in num_cols: regents_df[col] = regents_df[col].apply(lambda x: float(x)) means = [np.mean(regents_df[col].astype(float)) for col in num_cols] regents_stats = pd.DataFrame(means, index=num_cols, columns=['col_mean']) regents_stats['stdev'] = [ np.std(regents_df[col].astype(float)) for col in num_cols ] regents_stats['col_median'] = [ np.median(regents_df[col].astype(float)) for col in num_cols ] regents_stats['iqr'] = [ st.iqr(regents_df[col].astype(float)) for col in num_cols ] regents_stats['five_num'] = [ np.percentile(regents_df[col].astype(float), [0, 25, 50, 75, 100]) for col in num_cols ] regents_stats['deciles'] = [ np.percentile(regents_df[col].astype(float), [10, 20, 30, 40, 50, 60, 70, 80, 90]) for col in num_cols ] print(regents_stats['col_median']) algebra_scores = regents_df[regents_df['exam_name'] == 'common core algebra'] means = [np.mean(algebra_scores[col].astype(float)) for col in num_cols] alg_stats = pd.DataFrame(means, index=num_cols, columns=['col_mean'])
def _get_features_for_vector(vec, prefix): # Prepare frequently used values _len = len(vec) _max = max(vec) _pos_max = vec.index(_max) _min = min(vec) _pos_min = vec.index(_min) _range = abs(_max - _min) _var = variance(vec) _std = stdev(vec) _mean = mean(vec) _mode = mode(vec, axis=None)[0][0] _median = median(vec) features = { f'{prefix} | MAX': _max, f'{prefix} | MIN': _min, # f'{prefix} | POSITION OF MAX': _pos_max, # TOO BIG # f'{prefix} | POSITION OF MIN': _pos_min, # TOO BIG f'{prefix} | RELATIVE POSITION OF MAX': safe_div(_pos_max, _len), f'{prefix} | RELATIVE POSITION OF MIN': safe_div(_pos_min, _len), f'{prefix} | RANGE': _range, f'{prefix} | RELATIVE RANGE': safe_div(_range, _max), f'{prefix} | RELATIVE VARIATION RANGE': safe_div(_range, _mean), f'{prefix} | INTERQUARTILE RANGE': iqr(vec), f'{prefix} | RELATIVE INTERQUARTILE RANGE': safe_div(iqr(vec), _max), f'{prefix} | INTERDECILE RANGE': quantile(vec, 0.9) - quantile(vec, 0.1), f'{prefix} | RELATIVE INTERDECILE RANGE': safe_div(quantile(vec, 0.9) - quantile(vec, 0.1), _max), f'{prefix} | INTERPERCENTILE RANGE': quantile(vec, 0.99) - quantile(vec, 0.01), f'{prefix} | RELATIVE INTERPERCENTILE RANGE': safe_div(quantile(vec, 0.99) - quantile(vec, 0.01), _max), f'{prefix} | STUDENTIZED RANGE': safe_div(_range, _var), f'{prefix} | MEAN': _mean, # f'{prefix} | GEOMETRIC MEAN': gmean(vec), # always NaN # f'{prefix} | HARMONIC MEAN': harmonic_mean(vec), # harmonic mean does not support negative values f'{prefix} | MEAN EXCLUDING OUTLIERS (10)': trim_mean(vec, 0.1), f'{prefix} | MEAN EXCLUDING OUTLIERS (20)': trim_mean(vec, 0.2), f'{prefix} | MEAN EXCLUDING OUTLIERS (30)': trim_mean(vec, 0.3), f'{prefix} | MEAN EXCLUDING OUTLIERS (40)': trim_mean(vec, 0.4), f'{prefix} | MEAN EXCLUDING OUTLIERS (50)': trim_mean(vec, 0.5), f'{prefix} | MEDIAN': _median, f'{prefix} | MODE': _mode, f'{prefix} | VARIANCE': _var, f'{prefix} | STANDARD DEVIATION': _std, f'{prefix} | MEDIAN ABSOLUTE DEVIATION': median_absolute_deviation(vec), # f'{prefix} | GEOMETRIC STANDARD DEVIATION': gstd(vec), # The geometric standard deviation is defined for # strictly positive values only. 
        f'{prefix} | RELATIVE STANDARD DEVIATION': safe_div(_std, _mean),
        f'{prefix} | INDEX OF DISPERSION': safe_div(_var, _mean),
        # f'{prefix} | 3rd MOMENT': moment(_var, 3),  # always 0
        # f'{prefix} | 4th MOMENT': moment(_var, 4),  # always 0
        # f'{prefix} | 5th MOMENT': moment(_var, 5),  # always 0
        # f'{prefix} | 6th MOMENT': moment(_var, 6),  # always 0
        f'{prefix} | KURTOSIS': kurtosis(vec),
        f'{prefix} | SKEWNESS': skew(vec),
        f'{prefix} | PEARSONS 1st SKEWNESS COEFFICIENT': safe_div(3 * (_mean - _mode), _std),
        f'{prefix} | PEARSONS 2nd SKEWNESS COEFFICIENT': safe_div(3 * (_mean - _median), _std),
        f'{prefix} | 1st PERCENTILE': percentile(vec, 1),
        f'{prefix} | 5th PERCENTILE': percentile(vec, 5),
        f'{prefix} | 10th PERCENTILE': percentile(vec, 10),
        f'{prefix} | 20th PERCENTILE': percentile(vec, 20),
        f'{prefix} | 1st QUARTILE': percentile(vec, 25),
        f'{prefix} | 30th PERCENTILE': percentile(vec, 30),
        f'{prefix} | 40th PERCENTILE': percentile(vec, 40),
        f'{prefix} | 60th PERCENTILE': percentile(vec, 60),
        f'{prefix} | 70th PERCENTILE': percentile(vec, 70),
        f'{prefix} | 3rd QUARTILE': percentile(vec, 75),
        f'{prefix} | 80th PERCENTILE': percentile(vec, 80),
        f'{prefix} | 90th PERCENTILE': percentile(vec, 90),
        f'{prefix} | 95th PERCENTILE': percentile(vec, 95),
        f'{prefix} | 99th PERCENTILE': percentile(vec, 99),
        # f'{prefix} | SHANNON ENTROPY': entropy(vec),  # not meaningful here
        # f'{prefix} | MODULATION': _range / (_max + _min)  # not meaningful here
    }
    return features
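# Usage sketch for the extractor above (the input vector is made up; the statistical
# helpers imported by the original module are assumed to be available).
feats = _get_features_for_vector([1.2, 0.4, -0.3, 2.1, 0.9, 1.5], prefix='ACC_X')
print(feats['ACC_X | INTERQUARTILE RANGE'], feats['ACC_X | MEDIAN'])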
def trend(dow, intersection, direction, int_leg, new_dataframe, iqr_multiplier): ## trend: str str str str str => matplotlibplot ## requires: dow, intersection, direction, int_leg: strings as used in all previous functions ## new_dataframe: pd.DataFrame containing two columns- 'datetime_bin' and 'volume' ## The format of this dataframe is the same as the one returned by ## the grab function # grab data data = grab(dow, intersection, direction, int_leg) data['datetime_bin'] = pd.to_datetime(data['datetime_bin']) intervals = len( data.groupby(data['datetime_bin'].dt.strftime('%d')).count() ['datetime_bin']) #number of periods rdata = ts(append(data.volume.values, new_dataframe.volume.values), frequency=96) # decompose rstring = """function(testdata){ library(forecast) decomp <- stl(testdata, s.window = 'periodic') outdf<-as.data.frame(decomp$time.series) outdf }""" rfunc = robjects.r(rstring) r_df = rfunc(rdata) decomp_as_df = pandas2ri.ri2py(r_df) trendvalues = decomp_as_df['trend'] #all data including new data oldtrend = trendvalues[0:len(data) - 1] #old data # Create bounds (via scipy.stats.iqr and numpy.percentile) pct = [percentile(oldtrend, 25), percentile(oldtrend, 75)] #25th percentile and 75th iqrange = (iqr(oldtrend)) lower_bound = pct[0] - (iqrange * iqr_multiplier) upper_bound = pct[1] + (iqrange * iqr_multiplier) # if more than a quarter of the new data sits outside the bounds, do the following: if list(lower_bound <= trendvalues[len(data):]).count(False) >= 0.25 * len( new_dataframe) or list(upper_bound >= trendvalues[len(data):] ).count(False) >= 0.25 * len(new_dataframe): # Plot Data With Bounds and data cutoff plt.ioff() plt.figure(figsize=(18, 10)) plt.plot(trendvalues, linewidth=2, color='blue', alpha=0.7, label='Trend Volume') plt.axvline(x=(96 * intervals) - 1, c='#FF00FF', linewidth=4, alpha=0.7, linestyle='--', label='New Data Cutoff') # data cut off point plt.axhline(lower_bound, alpha=0.5, color='c') #lower bound plt.axhline(upper_bound, alpha=0.5, color='c') #upper bound plt.axhspan(lower_bound, upper_bound, alpha=0.1, facecolor='c', label='Trend Bounds') #spread plt.title("%s Trendline with New Data (%s Leg, %s)" % (intersection, int_leg, direction)) plt.rc('font', **font) plt.ylabel("Volume Trend") plt.legend() g.trend_graph_count += 1 #update graph count plt.savefig(path + '\\trend_%s.png' % (g.trend_graph_count), dpi=300)
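# The bound construction above (quartiles +/- multiplier * IQR of the historical trend)
# in isolation; a sketch, the function name is illustrative.
import numpy as np
from scipy.stats import iqr

def iqr_bounds(values, multiplier=1.5):
    q1, q3 = np.percentile(values, [25, 75])
    spread = iqr(values)
    return q1 - multiplier * spread, q3 + multiplier * spread

# lower_bound, upper_bound = iqr_bounds(oldtrend, iqr_multiplier)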
def find_bad_by_correlation(self, correlation_secs=1.0, correlation_threshold=0.4, frac_bad=0.1): """Find correlation between the low frequency components of the EEG below 50 Hz. Correlation is done using a sliding non-overlapping time window. The maximum absolute correlation is as the 98th percentile of the absolute values of the correlations with the other channels If the maximum correlation is less than 0.4 then the channel is designated as bad by corre- lation. Parameters ---------- correlation_secs: float length of the correlation time window (default: 1 secs). correlation_threshold: float correlation threshold below which channel is marked bad. frac_bad: float percentage of data windows in which the correlation threshold was not surpassed and if a channel gets a value of greater than 1%, it is designated bad. """ self.find_bad_by_hfnoise() # since filtering is performed there correlation_frames = correlation_secs * self.sample_rate correlation_window = np.arange(correlation_frames) correlation_offsets = np.arange( 1, (self.new_dimensions[1] - correlation_frames), correlation_frames) w_correlation = len(correlation_offsets) maximum_correlations = np.ones( (self.original_dimensions[0], w_correlation)) drop_out = np.zeros((self.new_dimensions[0], w_correlation)) channel_correlation = np.ones((w_correlation, self.new_dimensions[0])) noiselevels = np.zeros((w_correlation, self.new_dimensions[0])) channel_deviations = np.zeros((w_correlation, self.new_dimensions[0])) drop = np.zeros((w_correlation, self.new_dimensions[0])) len_correlation_window = len(correlation_window) EEGData = np.transpose(self.EEGData) EEG_new_win = np.reshape( np.transpose(EEGData[0:len_correlation_window * w_correlation, :]), (self.new_dimensions[0], len_correlation_window, w_correlation), order="F", ) data_win = np.reshape( np.transpose(self.EEGData_beforeFilt[0:len_correlation_window * w_correlation, :]), (self.new_dimensions[0], len_correlation_window, w_correlation), order="F", ) for k in range(0, w_correlation): eeg_portion = np.transpose(np.squeeze(EEG_new_win[:, :, k])) data_portion = np.transpose(np.squeeze(data_win[:, :, k])) window_correlation = np.corrcoef(np.transpose(eeg_portion)) abs_corr = np.abs( np.subtract(window_correlation, np.diag(np.diag(window_correlation)))) channel_correlation[k, :] = np.quantile(abs_corr, 0.98, axis=0) noiselevels[k, :] = np.divide( robust.mad(np.subtract(data_portion, eeg_portion), c=1), robust.mad(eeg_portion, c=1), ) channel_deviations[k, :] = 0.7413 * iqr(data_portion, axis=0) for i in range(0, w_correlation): for j in range(0, self.new_dimensions[0]): drop[i, j] = np.int( np.isnan(channel_correlation[i, j]) or np.isnan(noiselevels[i, j])) if drop[i, j] == 1: channel_deviations[i, j] = 0 noiselevels[i, j] = 0 maximum_correlations[self.channels_interpolate, :] = np.transpose( channel_correlation) drop_out[:] = np.transpose(drop) thresholded_correlations = maximum_correlations < correlation_threshold thresholded_correlations = thresholded_correlations.astype(int) fraction_BadCorrelationWindows = np.mean(thresholded_correlations, axis=1) fraction_BadDropOutWindows = np.mean(drop_out, axis=1) bad_correlation_channels_idx = np.argwhere( fraction_BadCorrelationWindows > frac_bad) bad_correlation_channels_name = self.ch_names_original[ bad_correlation_channels_idx.astype(int)] self.bad_by_correlation = [i[0] for i in bad_correlation_channels_name] dropout_channels_idx = np.argwhere( fraction_BadDropOutWindows > frac_bad) dropout_channels_name = self.ch_names_original[ 
dropout_channels_idx.astype(int)] self.bad_by_dropout = [i[0] for i in dropout_channels_name] return None
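# Core of the correlation criterion above, in isolation: for one time window, each
# channel's "maximum correlation" is the 98th percentile of its absolute correlations
# with all other channels. A sketch with illustrative names.
import numpy as np

def window_max_correlation(window):
    """window: array of shape (n_samples, n_channels)."""
    corr = np.corrcoef(window, rowvar=False)
    np.fill_diagonal(corr, 0.0)               # ignore self-correlation
    return np.quantile(np.abs(corr), 0.98, axis=0)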
df.head()

# In[ ]:

df[f_num]

# In[ ]:

from scipy import stats

IQR = []
for i in f_num:
    IQR.append(stats.iqr(df[i], interpolation='midpoint'))
IQR

# In[ ]:

limits = dict()
for j, i in enumerate(f_num):
    Q1 = np.percentile(df[i], 25, interpolation='midpoint')
    Q3 = np.percentile(df[i], 75, interpolation='midpoint')
    # print(Q1, Q3)
    limits[i] = [Q1 - 1.5 * IQR[j], Q3 + 1.5 * IQR[j]]
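# A possible follow-up (not in the original notebook): flag values outside the
# per-column limits computed above.
outlier_mask = {col: (df[col] < limits[col][0]) | (df[col] > limits[col][1])
                for col in f_num}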