def summary(self): """summary basic statistic for identified subnetwork""" print str(len(self.netdict))+' subnetwork generated:' n, (smin, smax), sm, sv, ss, sk = describe([self.netdict[key][0] for key in self.netdict]) print 'Subnet MI socre range ['+str(smin)+', '+str(smax)+'] of mean '+str(sm)+' and var '+str(sv) n, (smin, smax), sm, sv, ss, sk = describe([len(self.netdict[key][1]) for key in self.netdict]) print 'Subnet nodes size ['+str(smin)+', '+str(smax)+'] of mean '+str(int(sm))+' and var '+str(int(sv))
def calc_agreements(nr_of_abstracts=150): # Loop over the abstracts and calculate the kappa and alpha per abstract aggregate = [] for i in range(0, nr_of_abstracts): # try: annotators = round_robin(i) annotations_A = flatten(get_annotations(i, annotators[0])) annotations_B = flatten(get_annotations(i, annotators[1])) annotations = __str_combine_annotations(annotations_A, annotations_B) a = AnnotationTask(annotations, agreement_fn) aggregate.append({ "kappa" : a.kappa(), "alpha" : a.alpha(), "annotator_A" : annotators[0], "annotator_B" : annotators[1] }) # except: # print("Could not calculate kappa for abstract %i" % (i + 1)) # pass # Summary statistics kappa = describe([a['kappa'] for a in aggregate]) print("number of abstracts %i" % kappa[0]) print("[kappa] mean: " + str(kappa[2])) print("[kappa] variance: " + str(kappa[3])) alpha = describe([a['alpha'] for a in aggregate]) print("[alpha] mean: " + str(alpha[2])) print("[alpha] variance: " + str(alpha[3]))
def hipgaleq(): """Print summary of GAL-EQ comparison with SLALIB galeq (HIP).""" hip_tab = get_hipdata() sla_tab = get_sla("slalib_hip_galeq.txt") dummy = np.zeros((len(hip_tab['px']),)) v6l = convert.cat2v6(hip_tab['glon'], hip_tab['glat'], dummy, dummy, dummy, dummy, tpm.CJ) # The actual epoch of galactic data is J2000. But in SLALIB # the input is taken to be B1950.0. So use tpm.B1950 as epoch # in the conversion. v6o = convert.convertv6(v6l, s1=4, s2=6, epoch=tpm.B1950) cat = convert.v62cat(v6o, tpm.CJ) cat = cat2array(cat) ra_diff = np.degrees(cat['alpha']) - sla_tab[:, 0] ra_diff = np.abs(ra_diff * 3600.0) dec_diff = np.degrees(cat['delta']) - sla_tab[:, 1] dec_diff = np.abs(dec_diff * 3600.0) print("Comparison with SLALIB galeq using HIPPARCOS data.") fs = "{0} {1}\n" + \ "Min: {2:.4f} Max: {3:.4f} \nMean: {4:.4f} Std: {5:.4f}\n" x = stats.describe(ra_diff) print(fs.format("ra_diff", "arcsec", x[1][0], x[1][1], x[2], x[3] ** 0.5)) x = stats.describe(dec_diff) print(fs.format("dec_diff", "arcsec", x[1][0], x[1][1], x[2], x[3] ** 0.5))
def meta_data(): totin = 0 totout = 0 inqual = 0 outqual = 0 indist = list() outdist = list() for count, qual in zip(open('%s/clpair.numturkers'%DICT_DIR).readlines(), open('%s/clpair.turkerqual'%DICT_DIR).readlines()): lang, num = count.strip().split('\t') lang, score = qual.strip().split('\t') num = int(float(num.strip())) score = float(score.strip()) indist += [score] * num totin += num inqual += (num*score) for count, qual in zip(open('%s/nonclpair.numturkers'%DICT_DIR).readlines(), open('%s/nonclpair.turkerqual'%DICT_DIR).readlines()): lang, num = count.strip().split('\t') lang, score = qual.strip().split('\t') num = int(float(num.strip())) score = float(score.strip()) outdist += [score] * num totout += num outqual += (num*score) i_n, (i_min, i_max), i_m, i_v, i_s, i_k = stats.describe(indist) i_moe = math.sqrt(i_v)/math.sqrt(i_n) * 2.576 o_n, (o_mon, o_max), o_m, o_v, o_s, o_k = stats.describe(outdist) o_moe = math.sqrt(o_v)/math.sqrt(o_n) * 2.576 print 'In region: %d Turkers, Avg. score %0.3f (%0.3f, %.03f)'%(i_n, i_m, i_m - i_moe, i_m + i_moe) print 'Out of region: %d Turkers, Avg. score %0.3f (%0.3f, %.03f)'%(o_n, o_m, o_m - o_moe, o_m + o_moe)
def hipecleq(): """Print summary of ECL-EQ comparison with SLALIB ecleq (HIP).""" hip_tab = get_hipdata() sla_tab = get_sla("slalib_hip_ecleq.txt") dummy = np.zeros((len(hip_tab['px']),)) v6l = convert.cat2v6(hip_tab['elon2'], hip_tab['elat2'], dummy, dummy, dummy, dummy, tpm.CJ) v6o = convert.convertv6(v6l, s1=3, s2=6) cat = convert.v62cat(v6o, tpm.CJ) cat = cat2array(cat) ra_diff = np.degrees(cat['alpha']) - sla_tab[:, 0] ra_diff = np.abs(ra_diff * 3600.0) dec_diff = np.degrees(cat['delta']) - sla_tab[:, 1] dec_diff = np.abs(dec_diff * 3600.0) print("Comparison with SLALIB ecleq using HIPPARCOS data.") fs = "{0} {1}\n" + \ "Min: {2:.4f} Max: {3:.4f} \nMean: {4:.4f} Std: {5:.4f}\n" x = stats.describe(ra_diff) print(fs.format("ra_diff", "arcsec", x[1][0], x[1][1], x[2], x[3] ** 0.5)) x = stats.describe(dec_diff) print(fs.format("dec_diff", "arcsec", x[1][0], x[1][1], x[2], x[3] ** 0.5))
def hipeqgal(): """Print summary of EQ-GAL comparison with SLALIB eqgal (HIP).""" hip_tab = get_hipdata() sla_tab = get_sla("slalib_hip_eqgal.txt") dummy = np.zeros((len(hip_tab['px']),)) v6l = convert.cat2v6(hip_tab['raj2'], hip_tab['decj2'], dummy, dummy, dummy, dummy, tpm.CJ) v6o = convert.convertv6(v6l, s1=6, s2=4) # The galactic coordinates are at epoch J2000. But SLALIB # results are for B1950. So apply proper motion here. v6o = convert.proper_motion(v6o, tpm.B1950, tpm.J2000) cat = convert.v62cat(v6o, tpm.CJ) cat = cat2array(cat) ra_diff = np.degrees(cat['alpha']) - sla_tab[:, 0] ra_diff = np.abs(ra_diff * 3600.0) dec_diff = np.degrees(cat['delta']) - sla_tab[:, 1] dec_diff = np.abs(dec_diff * 3600.0) print("Comparison with SLALIB eqgal using HIPPARCOS data.") fs = "{0} {1}\n" + \ "Min: {2:.4f} Max: {3:.4f} \nMean: {4:.4f} Std: {5:.4f}\n" x = stats.describe(ra_diff) print(fs.format("ra_diff", "arcsec", x[1][0], x[1][1], x[2], x[3] ** 0.5)) x = stats.describe(dec_diff) print(fs.format("dec_diff", "arcsec", x[1][0], x[1][1], x[2], x[3] ** 0.5))
def statsAnalysis( self ): ge_arr = numpy.array( self.geneexpdict.values() ) descstats = stats.describe( ge_arr ) # descriptive statistics for the log fold change values: size of array, (min,max), mean, var, skewness, kurtosis print descstats raw_avg_logfc = numpy.mean( ge_arr ); raw_stdev_logfc = numpy.std( ge_arr ); print "raw mean and sd: ", raw_avg_logfc, raw_stdev_logfc; stats.probplot( ge_arr, plot=matplotlib.pyplot ) matplotlib.pyplot.savefig('qqplot_raw.png') matplotlib.pyplot.close(); # if the distribution is not central, the n and nn labels could be assigned to genes with > 0 log(fc). To avoid this, convert gene exp values to z-scores and recalculate mean and sd for k in self.geneexpdict.keys(): v = self.geneexpdict[k]; zscore = (v - raw_avg_logfc)/float(raw_stdev_logfc); self.geneexpdict[k] = zscore; # recompute distribution parameters ge_arr = numpy.array( self.geneexpdict.values() ) descstats = stats.describe( ge_arr ) # descriptive statistics for the log fold change values: size of array, (min,max), mean, var, skewness, kurtosis print descstats self.avg_logfoldchange = numpy.mean( ge_arr ); self.stdev_logfoldchange = numpy.std( ge_arr ); print "centralized mean and sd: ", self.avg_logfoldchange, self.stdev_logfoldchange; stats.probplot( ge_arr, plot=matplotlib.pyplot ) matplotlib.pyplot.savefig('qqplot_centralized.png') matplotlib.pyplot.close();
def anova(lists):
    base = lists['all']
    print 'all', stats.describe(base)
    for l in lists:
        if not l == 'all':
            print l, stats.describe(lists[l])
            print stats.f_oneway(base, lists[l])
def main(): here = os.path.dirname(os.path.realpath(__file__)) filenames = ['cumul_HeLa-S3_Cytoplasm.stat', 'cumul_HeLa-S3_Nucleus.stat', 'cumul_HeLa-S3_Whole_Cell.stat'] #filenames = ['cumul_K562_Nucleus.stat'] for filename in filenames: tuning_file = os.path.join(os.path.split(here)[0], 'output', filename) #tuning_file = os.path.join(os.path.split(here)[0], 'output', 'test.stat') contents = {0:'cumul', 1:'pA_to_cumul_dist', 2:'pA_cumul', 3: 'd_stream_covr', 4: 'u_stream_covr', 5: 'rpkm', 6:'utr_length', 7: 'strand'} tuning_handle = open(tuning_file, 'rb') header = tuning_handle.next().split() # Get the stats dictionary data = {} for line in tuning_handle: (utr_id, default_cumul, pA_to_cumul_dist, pA_cumul, d_stream_covr, u_stream_covr, rpkm, utr_length, strand) = line.split() if float(rpkm) < 2: continue data[utr_id] = (int(default_cumul), int(pA_to_cumul_dist), float(pA_cumul), float(d_stream_covr), float(u_stream_covr), float(rpkm), int(utr_length), strand) # Print the distribution of pA_cumul # Print the mean and the standard deviation as well # TODO wait for the calculation to finish. Then look at the mean and std and # maybe plot as well. For now, I move on! # AS WELL! Get a measure on how good your changes are: get the mean of the # distances from the cut-off to the actual polyas. This distance should # decrease with each iteration. # The relative cumulative length of the pA clusters pA_cumuls = [vals[2] for vals in data.itervalues()] (n_cumul, min_max_cumul, mean_cumul, var_cumul) = stats.describe(pA_cumuls)[:4] print filename, mean_cumul, var_cumul # The before/after coverage ratio of the pA clusters beg_aft = [math.log(vals[3]/vals[4], 2) for vals in data.itervalues() if vals[4]!=0] (n_ratio, min_max_ratio, mean_ratio, var_ratio) = stats.describe(beg_aft)[:4] print filename, mean_ratio, var_ratio box_plot(beg_aft, filename) fig = plt.figure() ax = fig.add_subplot(111) ax.hist(beg_aft, bins=30) ax.set_title(filename) plt.show()
def summary(self): """summary basic statistic for identified subnetwork""" print str(len(self.netdict))+' subnetwork generated:' n, (smin, smax), sm, sv, ss, sk = describe([self.netdict[key][0] for key in self.netdict]) print 'Subnet socre ['+str(smin)+', '+str(smax)+'] of mean '+str(sm)+' and var '+str(sv) n, (smin, smax), sm, sv, ss, sk = describe([len(self.netdict[key][1].nodes()) for key in self.netdict]) print 'Subnet nodes size ['+str(smin)+', '+str(smax)+'] of mean '+str(int(sm))+' and var '+str(int(sv)) counter = Counter(self.depth) print 'Subnet depth summary:' for each in sorted(counter.keys()): print 'depth '+str(each)+': '+str(counter[each])
def print_statistics(self): from scipy import stats sz, (mn, mx), avg, var,skew, kurt = stats.describe(self.score1) s1_stats = (sz, mn, mx, avg, var,skew, kurt)[:5] sz, (mn, mx), avg, var,skew, kurt = stats.describe(self.score2) s2_stats = (sz, mn, mx, avg, var,skew, kurt)[:5] stat_string = ("\n\tTrace 1 \t Trace2\n" + '-'*40 + '\n' + "Length \t %d \t\t %d \n" % (s1_stats[0], s2_stats[0]) + "Min \t %f \t %f \n" % (s1_stats[1], s2_stats[1]) + "Max \t %f \t %f \n" % (s1_stats[2], s2_stats[2]) + "Average %f \t %f \n" % (s1_stats[3], s2_stats[3]) + "Variance %f \t %f \n" % (s1_stats[4], s2_stats[4]) ) print stat_string
def print_statistics(a1, a2):
    sta1 = scs.describe(a1)
    sta2 = scs.describe(a2)
    print "%14s %14s %14s" % ("statistic", "data set 1", "data set 2")
    print 45 * "-"
    print "%14s %14.3f %14.3f" % ("size", sta1[0], sta2[0])
    print "%14s %14.3f %14.3f" % ("min", sta1[1][0], sta2[1][0])
    print "%14s %14.3f %14.3f" % ("max", sta1[1][1], sta2[1][1])
    print "%14s %14.3f %14.3f" % ("mean", sta1[2], sta2[2])
    print "%14s %14.3f %14.3f" % ("std", np.sqrt(sta1[3]), np.sqrt(sta2[3]))
    print "%14s %14.3f %14.3f" % ("skew", sta1[4], sta2[4])
    print "%14s %14.3f %14.3f" % ("kurtosis", sta1[5], sta2[5])
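# A minimal sketch (not part of the snippets above) showing the same two-column summary
# using the named fields of the DescribeResult namedtuple that scipy.stats.describe
# returns, instead of positional indexing; the function name and usage are illustrative only.
import numpy as np
from scipy import stats

def print_statistics_named(a1, a2):
    s1, s2 = stats.describe(a1), stats.describe(a2)
    rows = [
        ("size", s1.nobs, s2.nobs),
        ("min", s1.minmax[0], s2.minmax[0]),
        ("max", s1.minmax[1], s2.minmax[1]),
        ("mean", s1.mean, s2.mean),
        ("std", np.sqrt(s1.variance), np.sqrt(s2.variance)),
        ("skew", s1.skewness, s2.skewness),
        ("kurtosis", s1.kurtosis, s2.kurtosis),
    ]
    print("%14s %14s %14s" % ("statistic", "data set 1", "data set 2"))
    print(45 * "-")
    for name, v1, v2 in rows:
        print("%14s %14.3f %14.3f" % (name, v1, v2))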
def eda_var(df, var, units):
    # statistical summary
    print("Statistical Summary of " + var + ".")
    print(df[var].describe())
    print("Distribution Analysis of " + var + ".")
    print(st.describe(df[var]))
    # frequency distributions
    print("Frequency distribution of " + var + ".")
    print(df[var].value_counts().sort_values())
    print("Normalized frequency distribution of " + var + ".")
    print(df[var].value_counts(normalize=True))
    #eda_plot(df, var, units)
    dist_plot(usage, var)
def make_density(data,**kwargs): mat=array(data) if mat.shape[1]==2 : #data is couples (x,y) mat=mat.T normed=kwargs.get('normed',False) nbin=kwargs.get('bins',100) logy=kwargs.get('logy',False) remove0=1-kwargs.get('include_zeroes',1) bintype='linear' xmin,xmax=np.min(mat[0]),np.max(mat[0]) if kwargs.get('logx',False): numat=mat[0][mat[0]>10**(-100)] xmin,xmax=min(nozero(nonan(numat))),max(nonan(numat)) bintype='log' bins=np.logspace(log10(xmin),log10(xmax),nbin,False) binw=[bins[i+1]-bins[i] for i in xrange(nbin-1)] binw.append(xmax-bins[-1]) xspan=xmax-xmin try: bins except: #linear spacing case binw=xspan/nbin bins=np.linspace(xmin,xmax,nbin,False) binnage=[[] for i in xrange(nbin)] for x,y in mat.T : if x<xmin or x>xmax: continue if bintype=='linear': xbin=int(floor(float(x-xmin)/binw)) else : xbin=bisct.bisect_left(bins,x) if xbin ==nbin: #maxvalue xbin=nbin-1 if remove0 and abs(y)>10**(-40): binnage[xbin].append(y) res=array([stats.describe(i)[2:]+(min(i),max(i)) if i else stats.describe([0])[2:]+(0,0) for i in binnage]) sspercen=scipy.stats.scoreatpercentile if kwargs.get('relative',1): quantile=array([array([sspercen(i,50),sspercen(i,50)-sspercen(i,5),sspercen(i,95)-sspercen(i,50)]) if i else array([0,0,0]) for i in binnage]) res2=array([-res[:,-2]+res[:,0],res[:,-1]-res[:,0]]) else: quantile=array([array([sspercen(i,50),sspercen(i,5),sspercen(i,95)]) if i else array([0,0,0]) for i in binnage]) res2=array([res[:,-2],res[:,-1]]) quantile=quantile.T if normed : if bintype=='linear': res[:,0]/=sum(res[:,0])*binw else : res[:,0]/=sum(np.dot(res[:,0],binw)) return bins,res[:,0],res[:,1],res2,quantile[0],quantile[1:],array([len(i) for i in binnage])
def describe_out(self): stat_train = pd.DataFrame(columns=['Min', 'Max', 'Mean', 'Median', 'SD', 'Skew', 'Kurt']) for i in range(self.features_index[0], self.returns_next_days_index[1]+1): data = self.train_data.iloc[:,i] n, min_max, mean, var, skew, kurt = stats.describe(data) stat_train.loc[self.train_data.columns[i]] = [min_max[0], min_max[1], mean, data.median(), scipy.sqrt(var), skew, kurt] stat_train.to_csv('../data/stat_train.csv') stat_test = pd.DataFrame(columns=['Min', 'Max', 'Mean', 'Median', 'SD', 'Skew', 'Kurt']) for i in range(self.features_index[0], self.returns_predict_index[0]): data = self.test_data.iloc[:,i] n, min_max, mean, var, skew, kurt = stats.describe(data) stat_test.loc[self.test_data.columns[i]] = [min_max[0], min_max[1], mean, data.median(), scipy.sqrt(var), skew, kurt] stat_test.to_csv('../data/stat_test.csv')
def gen_feature_mfcc(x, debug=False):
    d = len(x[0])
    if debug:
        print (" - - entering gen_feature_mfcc")
    v = x[:, 0:d-1]  # the last column is 0, ignore it
    if np.isnan(v).any() or np.isinf(v).any():
        raise Exception('MFCC contains NaN or Inf')
    xn, (xmin, xmax), xmean, xvar, xskew, xkurt = stats.describe(v)
    if debug:
        print stats.describe(v)
    d = np.diff(v, 1, 0)
    dmean = np.mean(d, 0)
    x = np.concatenate((xmean, xmin, xmax, xvar, dmean))
    if np.isnan(x).any() or np.isinf(x).any():
        raise Exception('Feature vector contains NaN or Inf')
    return x
def simulate_seasons(N, start_week, players): winner_equity = defaultdict(float) last_weeks = [] for i in range(N): if i % 1000 == 0: print i last_week, winners = simulate_season(start_week, players) for winner in winners: winner_equity[winner] += 1. / len(winners) last_weeks.append(last_week) pp({p: "%.2f" % (100 * e / N) for p, e in winner_equity.iteritems()}) print stats.describe(last_weeks) print stats.histogram(last_weeks)
def _init_fld2val(self, name, vals): """Describe summary statistics for a list of numbers.""" #pylint: disable=no-member vals_stats = stats.describe(vals) stddev = math.sqrt(vals_stats[3]) # stats variance p25 = np.percentile(vals, 25) p50 = np.percentile(vals, 50) # median p75 = np.percentile(vals, 75) fld2val = { 'name':name, 'qty'.format(ITEMS=self.desc):vals_stats[0], # stats nobs 'range':self._get_str_range(vals_stats), '25th percentile':p25, 'median':p50, '75th percentile':p75, 'mean':vals_stats[2], # stats mean 'stddev':stddev} fmtflds = set(['25th percentile', 'median', '75th percentile', 'mean', 'stddev']) mkint = "," in self.fmtstr for key, val in fld2val.items(): if key in fmtflds: if mkint: val = int(round(val)) val = self.fmtstr.format(val) fld2val[key] = val return fld2val
def processAlgorithm(self, progress): layer = QGisLayers.getObjectFromUri(self.getParameterValue(self.INPUT_LAYER)) valuesFieldName = self.getParameterValue(self.VALUES_FIELD_NAME) categoriesFieldName = self.getParameterValue(self.CATEGORIES_FIELD_NAME) output = self.getOutputFromName(self.OUTPUT) valuesField = layer.fieldNameIndex(valuesFieldName) categoriesField = layer.fieldNameIndex(categoriesFieldName) features = QGisLayers.features(layer) nFeat = len(features) values = {} for feat in features: attrs = feat.attributes() value = float(attrs[valuesField].toDouble()[0]) cat = unicode(attrs[categoriesField].toString()) if cat not in values: values[cat] = [] values[cat].append(value) fields = [QgsField("category", QVariant.String), QgsField("mean", QVariant.Double), QgsField("variance", QVariant.Double)] writer = output.getTableWriter(fields) for cat, value in values.items(): n, min_max, mean, var, skew, kurt = stats.describe(value) record = [cat, mean, math.sqrt(var)] writer.addRecord(record)
def main():
    parser = OptionParser(usage="usage: %prog [options] file", version="%prog 0.1")
    # parser.add_option("-t", "--template",
    #                   action="store", type="string", dest="template",
    #                   help="declare output format")
    parser.add_option(
        "-s", "--separator",
        action="store", type="string", dest="separator",
        help="separator in the output file [default = ' ']",
        default=" ",
    )
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")

    data = np.loadtxt(args[0])
    # with open(args[0]) as fd:
    #     data2 = np.fromfile(fd, sep='\n', dtype=float)
    #     print(data2)
    # df = pd.DataFrame(data)
    # print(data)
    # print(df)

    # numpy
    mean = data.mean()
    med = np.median(data)
    var = data.var()
    sdev = data.std()
    delta = confidence_interval(data)

    # pandas
    """
    mean = df.mean()
    med = df.median()
    var = df.var()
    sdev = df.std()
    """

    # scipy stats
    sem = stats.sem(data)  # standard error of the mean

    #'''
    print("mean : {}".format(mean))
    print("median : {}".format(med))
    print("variance : {}".format(var))
    print("standard deviation: {}".format(sdev))
    print("stats.sem : {}".format(sem))
    print("conf. interval : {}".format(delta))
    #'''
    # print('{0}{1}{2}'.format(mean[0], options.separator, sdev[0]))

    dn, dmin_max, dmean, dvar, dskew, dkurt = stats.describe(data)
    dstd = math.sqrt(dvar)
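# The confidence_interval() helper called above is not defined in this snippet. A minimal
# sketch of what such a helper might look like (an assumption, not the original code),
# returning the 95% half-width of the mean based on the t distribution:
from scipy import stats

def confidence_interval(data, confidence=0.95):
    n = len(data)
    sem = stats.sem(data)                               # standard error of the mean
    t_crit = stats.t.ppf((1 + confidence) / 2, n - 1)   # two-sided critical value
    return t_crit * sem                                 # CI is mean +/- this delta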
def examples_normexpand(): skewnorm = SkewNorm_gen() rvs = skewnorm.rvs(5, size=100) normexpan = NormExpan_gen(rvs, mode="sample") smvsk = stats.describe(rvs)[2:] print "sample: mu,sig,sk,kur" print smvsk dmvsk = normexpan.stats(moments="mvsk") print "normexpan: mu,sig,sk,kur" print dmvsk print "mvsk diff distribution - sample" print np.array(dmvsk) - np.array(smvsk) print "normexpan attributes mvsk" print mc2mvsk(normexpan.cnt) print normexpan.mvsk mc, mnc = mvsk2m(dmvsk) print "central moments" print mc print "non-central moments" print mnc pdffn = pdf_moments(mc) print "\npdf approximation from moments" print "pdf at", mc[0] - 1, mc[0] + 1 print pdffn([mc[0] - 1, mc[0] + 1]) print normexpan.pdf([mc[0] - 1, mc[0] + 1])
def topic_entropy(self): # Get stats for the whole corpus (sample) cstats = {} sql1 = "SELECT avg(topic_entropy), max(topic_entropy), min(topic_entropy) FROM doctopic" sql2 = "INSERT INTO corpusstats (mean_topic_entropy,max_topic_entropy,min_topic_entropy) VALUES (?,?,?)" for r in self.curin.execute(sql1): cstats['avg_h'], cstats['max_h'], cstats['min_h'] = r self.curout.execute(sql2,r) self.connout.commit() # Get stats for each doc (trial, observation) dstats = {} all_h = [] sql3 = "SELECT doc_id, doc_label, topic_entropy FROM doctopic" for r in self.curin.execute(sql3): dstats[r[0]] = {} dstats[r[0]]['label'] = r[1] dstats[r[0]]['entropy'] = r[2] all_h.append(r[2]) all_h = np.array(all_h) dr = sps.describe(all_h) print(dr) for k in dr: print(k)
def compute_statistics(serie):
    """Compute the statistics of SERIE using stats.describe."""
    sizeData, (minimum, maximum), arithmeticMean, variance, skewness, kurtosis = stats.describe(serie)
    print "Size Data = ", sizeData, "Min, Max = ", (minimum, maximum), "Mean = ", arithmeticMean, "Variance = ", variance
def _testCombine(): A = random(10000) B = 10 * random(1000) C = hstack([A,B]) run3 = RunningStatistics(A) run3.push(B) _compareDescriptions(run3.describe, describe(C))
def examples_normexpand(): skewnorm = SkewNorm_gen() rvs = skewnorm.rvs(5,size=100) normexpan = NormExpan_gen(rvs, mode='sample') smvsk = stats.describe(rvs)[2:] print('sample: mu,sig,sk,kur') print(smvsk) dmvsk = normexpan.stats(moments='mvsk') print('normexpan: mu,sig,sk,kur') print(dmvsk) print('mvsk diff distribution - sample') print(np.array(dmvsk) - np.array(smvsk)) print('normexpan attributes mvsk') print(mc2mvsk(normexpan.cnt)) print(normexpan.mvsk) from statsmodels.stats.momenthelpers import mvsk2mnc, mnc2mc mnc = mvsk2mnc(dmvsk) mc = mnc2mc(mnc) print('central moments') print(mc) print('non-central moments') print(mnc) pdffn = pdf_moments(mc) print('\npdf approximation from moments') print('pdf at', mc[0]-1,mc[0]+1) print(pdffn([mc[0]-1,mc[0]+1])) print(normexpan.pdf([mc[0]-1,mc[0]+1]))
def avg_interest_rtt(ec, run): logs_dir = ec.run_dir # Parse downloaded CCND logs (graph, content_names, interest_expiry_count, interest_dupnonce_count, interest_count, content_count) = ccn_parser.process_content_history_logs( logs_dir, ec.netgraph.topology) # statistics on RTT rtts = [content_names[content_name]["rtt"] \ for content_name in content_names.keys()] # sample mean and standard deviation sample = numpy.array(rtts) n, min_max, mean, var, skew, kurt = stats.describe(sample) std = math.sqrt(var) ci = stats.t.interval(0.95, n-1, loc = mean, scale = std/math.sqrt(n)) global metrics metrics.append((mean, ci[0], ci[1])) return mean
def fixme(): num_freqs = 8 min_frequency = 3.0 max_frequency = 60.0 morlet_transform = morlet.MorletWaveletTransform(5, np.logspace(np.log10(min_frequency), np.log10(max_frequency), num_freqs), 1000, 4096) # morlet_transform = morlet.MorletWaveletTransform(5, min_frequency, max_frequency, num_freqs, 1000, 4096) # morlet_transform = morlet.MorletWaveletTransform() # morlet_transform.init_flex(5, np.logspace(np.log10(min_frequency), np.log10(max_frequency), num_freqs), 1000, 4096) # morlet_transform.init(5, min_frequency, max_frequency, num_freqs, 1000, 4096) samplerate = 1000. frequency = 60.0 modulation_frequency = 80.0 duration = 4.096 n_points = int(np.round(duration*samplerate)) x = np.arange(n_points, dtype=np.float) signal = np.sin(x*(2*np.pi*frequency/samplerate))-np.cos(x*(2*np.pi*frequency/samplerate)) powers=np.empty(shape=(signal.shape[0]*num_freqs,), dtype=np.float) num_of_iterations = 100 # for i in range(num_of_iterations): # morlet_transform.multiphasevec(signal,powers) morlet_transform.multiphasevec(signal,powers) powers = powers.reshape(8,powers.shape[0]/8) print(describe(powers))
def calculate_statistics(self):
    """Recompute the statistics for the index's set of quotes."""
    values = []
    for quote in self.quotes:
        values.append(quote.value)
    n, (smin, smax), sm, sv, ss, sk = stats.describe(values)
    return (sm, sv, ss, sk)
def batch_filter(self, X, Y): n, min_max, mean, var, skew, kurt = stats.describe(Y) sd = math.sqrt(var) y_index = Y[(Y > mean - self.range * sd).values & (Y < mean + self.range * sd).values].index.tolist() X = X.iloc[y_index, :] Y = Y[y_index] return X, Y
def testMtimesNHands(d, p1, p2, deck = deckShuffle(6), M = 100, N = 10000): ''' Simulate a larger number of games and record the count averages against the house. ''' list1=[] for j in range(M): count = 0 for i in range(N): P = basicStrategy(d, p1, p2, deck) D = dealerPlay(d, deck) count += playerWin(D[0], P[0])[0] if len(deck)<10: deck = deckShuffle() list1.append(count) l = 0 for i in list1: l = l + i/(M*1.0) return l/(N*1.0), list1 plt.hist(list1, color='k', alpha = .15) plt.show() print '-------' print '''Summary stats for Strategy 1''' print '-------' Size, Range, Mean, variance, skewness, kurtosis = describe(list1) print 'Number of observations: ', Size print 'Mean: ', Mean print 'Min to Max: ', Range print 'Variance: ', variance print 'Standard Dev.: ', sqrt(variance) print 'Skewness: ', skewness print 'Kurtosis: ', kurtosis print '-------' print '-------'
import numpy as np
from scipy import stats
import math as mt

dados = [40000, 18000, 12000, 250000, 30000, 140000, 300000, 40000, 800000]
media = np.mean(dados)
mediana = np.median(dados)
quartis = np.quantile(dados, [0, 0.25, 0.75, 1])
desvio_padrao = np.std(
    dados, ddof=1
)  # ddof=1 applies Bessel's correction, since the data are a sample rather than the full population
describe = stats.describe(dados)
print(describe)
print(mt.sqrt(describe[3]))
print(desvio_padrao)
s += w rank[mx] = 1 m_gmean2 = np.exp(m_gmean2 / s) # %% [code] top_mean = 0 s = 0 for n in [0, 1, 3, 7, 26]: top_mean += concat_sub.iloc[:, n] * scores[top[n]] s += scores[top[n]] top_mean /= s # %% [code] m_gmean = np.exp(0.3 * np.log(m_gmean1) + 0.2 * np.log(m_gmean2) + 0.5 * np.log(top_mean)) describe(m_gmean) # %% [code] concat_sub['isFraud'] = m_gmean concat_sub[['isFraud']].to_csv('stack_gmean.csv') all_files2 = glob.glob("/tmp/lgmodels/*.csv") all_files2.sort(key=lambda s: s.split('.')[1], reverse=True) aa_outs = [ pd.read_csv(all_files2[f], index_col=0) for f in range(len(all_files2)) ] aa_concat_sub = pd.concat(aa_outs, axis=1) # aa_concat_sub.columns = all_files aa_corr = aa_concat_sub.corr()
def run(data_path, cfg): print "Running SPC image conversion..." # get the base name of the directory base_dir_name = os.path.basename(os.path.abspath(data_path)) # list the directory for tif images print "Listing directory " + base_dir_name + "..." image_list = [] if cfg.get('MergeSubDirs', "false").lower() == "true": sub_directory_list = sorted( glob.glob(os.path.join(data_path, "[0-9]" * 10))) for sub_directory in sub_directory_list: print "Listing sub directory " + sub_directory + "..." image_list += glob.glob(os.path.join(sub_directory, "*.tif")) else: image_list += glob.glob(os.path.join(data_path, "*.tif")) image_list = sorted(image_list) # skip if no images were found if len(image_list) == 0: print "No images were found. skipping this directory." return # Get the total number of images in the directory total_images = len(image_list) # Create the output directories for the images and web app files subdir = os.path.join(data_path, '..', base_dir_name + '_static_html') if not os.path.exists(subdir): os.makedirs(subdir) image_dir = os.path.join(subdir, 'images') if not os.path.exists(image_dir): os.makedirs(image_dir) print "Starting image conversion and page generation..." # loop over the images and do the processing images_per_dir = cfg.get('ImagesPerDir', 2000) if cfg.get("BayerPattern").lower() == "rg": bayer_conv = cv2.COLOR_BAYER_RG2RGB if cfg.get("BayerPattern").lower() == "bg": bayer_conv = cv2.COLOR_BAYER_BG2RGB print "Loading images...\r", bundle_queue = Queue() for index, image in enumerate(image_list): reldir = 'images/' + str( images_per_dir * int(index / images_per_dir)).zfill(5) absdir = os.path.join( image_dir, str(images_per_dir * int(index / images_per_dir)).zfill(5)) filename = os.path.basename(image) if not os.path.exists(absdir): os.makedirs(absdir) bundle = {} bundle['image_path'] = image bundle['image'] = cvtools.import_image(os.path.dirname(image), filename, bayer_pattern=bayer_conv) bundle['data_path'] = data_path bundle['image_dir'] = absdir bundle['reldir'] = reldir bundle['cfg'] = cfg bundle['total_images'] = total_images bundle_queue.put(bundle) print "Loading images... (" + str(index) + " of " + str( total_images) + ")\r", #if index > 2000: # total_images = index # break # Get the number o proceess to use based on CPUs n_threads = multiprocessing.cpu_count() - 1 if n_threads < 1: n_threads = 1 # Create the set of processes and start them start_time = time.time() output_queue = Queue() processes = [] for i in range(0, n_threads): p = Process(target=process_bundle_list, args=(bundle_queue, output_queue)) p.start() processes.append(p) # Monitor processing of the images and save processed images to disk as they become available print "\nProcessing Images...\r", counter = 0 entry_list = [] use_jpeg = use_jpeg = cfg.get("UseJpeg").lower() == 'true' raw_color = cfg.get("SaveRawColor").lower() == 'true' while True: print "Processing and saving images... 
(" + str(counter).zfill( 5) + " of " + str(total_images).zfill(5) + ")\r", if counter >= total_images: break #if output_queue.qsize() == 0: try: output = output_queue.get() if output: entry_list.append(output['entry']) output_path = os.path.join(output['image_path'], output['prefix']) if use_jpeg: if raw_color: cv2.imwrite( os.path.join(output_path + "_rawcolor.jpeg"), output['features']['rawcolor']) cv2.imwrite(os.path.join(output_path + ".jpeg"), output['features']['image']) else: if raw_color: cv2.imwrite( os.path.join(output_path + "_rawcolor.png"), output['features']['rawcolor']) cv2.imwrite(os.path.join(output_path + ".png"), output['features']['image']) cv2.imwrite(os.path.join(output_path + "_binary.png"), output['features']['binary']) counter = counter + 1 except: time.sleep(0.05) # Record the total time for processing proc_time = int(math.floor(time.time() - start_time)) # Terminate the processes in case they are stuck for p in processes: p.terminate() print "\nPostprocessing..." # sort the entries by height and build the output entry_list.sort(key=itemgetter('maj_axis_len'), reverse=True) # Create histograms of several key features # image resolution in mm/pixel image_res = cfg.get('PixelSize', 22.1) / 1000 #print "Image resolution is set to: " + str(image_res) + " mm/pixel." # Get arrays from the dict of features total_images = len(entry_list) nbins = int(np.ceil(np.sqrt(total_images))) maj_len = np.array(map(itemgetter('maj_axis_len'), entry_list)) * image_res min_len = np.array(map(itemgetter('min_axis_len'), entry_list)) * image_res aspect_ratio = np.array(map(itemgetter('aspect_ratio'), entry_list)) orientation = np.array(map(itemgetter('orientation'), entry_list)) area = np.array(map(itemgetter('area'), entry_list)) * image_res * image_res unixtime = np.array(map(itemgetter('timestamp'), entry_list)) elapsed_seconds = unixtime - np.min(unixtime) file_size = np.array(map(itemgetter('file_size'), entry_list)) / 1000.0 #print unixtime total_seconds = max(elapsed_seconds) print "Total seconds recorded: " + str(total_seconds) if total_seconds < 1: total_seconds = 1 print "\nComputing histograms..." # Compute histograms all_hists = {} hist = np.histogram(area, nbins) all_hists['area'] = json.dumps(zip(hist[1].tolist(), hist[0].tolist())) hist = np.histogram(maj_len, nbins) all_hists['major_axis_length'] = json.dumps( zip(hist[1].tolist(), hist[0].tolist())) hist = np.histogram(min_len, nbins) all_hists['minor_axis_length'] = json.dumps( zip(hist[1].tolist(), hist[0].tolist())) hist = np.histogram(aspect_ratio, nbins) all_hists['aspect_ratio'] = json.dumps( zip(hist[1].tolist(), hist[0].tolist())) hist = np.histogram(elapsed_seconds, np.uint32(total_seconds)) all_hists['elapsed_seconds'] = json.dumps( zip(hist[1].tolist(), hist[0].tolist())) hist = np.histogram(orientation, nbins) all_hists['orientation'] = json.dumps( zip(hist[1].tolist(), hist[0].tolist())) hist = np.histogram(file_size, nbins) print "\nComputing stats..." all_hists['file_size'] = json.dumps(zip(hist[1].tolist(), hist[0].tolist())) # Compute general stats from features all_stats = {} all_stats['area'] = stats.describe(area) all_stats['major_axis_length'] = stats.describe(maj_len) all_stats['minor_axis_length'] = stats.describe(min_len) all_stats['aspect_ratio'] = stats.describe(aspect_ratio) all_stats['elapsed_seconds'] = stats.describe(elapsed_seconds) all_stats['orientation'] = stats.describe(orientation) all_stats['file_size'] = stats.describe(file_size) print "Building web app..." 
# Load html template for rendering template = "" with open(os.path.join('app', 'index.html'), "r") as fconv: template = fconv.read() # Define the render context from the processed histograms, images, and stats context = {} context['version'] = '1.0.1.05' context['total_images'] = total_images context['proc_time'] = proc_time context['duration'] = total_seconds context['compression_ratio'] = int( (1000.0 * 24 * total_images) / np.sum(file_size)) context['rois_per_second'] = total_images / context['duration'] context['kb_per_second'] = int(np.sum(file_size) / context['duration']) context['recording_started'] = datetime.datetime.fromtimestamp( np.min(unixtime)).strftime('%Y-%m-%d %H:%M:%S') context['app_title'] = "SPC Convert: " + base_dir_name context['dir_name'] = base_dir_name context['raw_color'] = raw_color context['image_res'] = image_res if use_jpeg: context['image_ext'] = '.jpeg' else: context['image_ext'] = '.png' context['stats_names'] = [{ "name": "Min" }, { "name": "Max" }, { "name": "Mean" }, { "name": "Standard Deviation" }, { "name": "Skewness" }, { "name": "Kurtosis" }] # definie the charts to display from the histogram data charts = [] for chart_name, data_values in all_hists.iteritems(): chart = {} chart['source'] = 'js/' + chart_name + '.js' chart['name'] = chart_name units = "" if chart_name == 'area': units = " (mm*mm)" if chart_name == 'major_axis_length' or chart_name == 'minor_axis_length': units = " (mm)" if chart_name == 'file_size': units = " (kB)" if chart_name == 'elapsed_seconds': units = " (s)" if chart_name == 'orientation': units = " (deg)" chart['title'] = 'Histogram of ' + chart_name + units chart['x_title'] = chart_name + units chart['y_title'] = 'counts' chart['stats_title'] = chart_name chart['data'] = data_values chart['stats'] = [] chart['stats'].append({ "name": "Min", "value": "{:10.3f}".format(all_stats[chart_name][1][0]) }) chart['stats'].append({ "name": "Max", "value": "{:10.3f}".format(all_stats[chart_name][1][1]) }) chart['stats'].append({ "name": "Mean", "value": "{:10.3f}".format(all_stats[chart_name][2]) }) chart['stats'].append({ "name": "Standard Deviation", "value": "{:10.3f}".format(math.sqrt(all_stats[chart_name][3])) }) chart['stats'].append({ "name": "Skewness", "value": "{:10.3f}".format(all_stats[chart_name][4]) }) chart['stats'].append({ "name": "Kurtosis", "value": "{:10.3f}".format(all_stats[chart_name][5]) }) charts.append(chart) context['charts'] = charts # render the html page and save to disk page = pystache.render(template, context) with open(os.path.join(subdir, 'spcdata.html'), "w") as fconv: fconv.write(page) # remove any old app files and try to copy over new ones try: shutil.rmtree(os.path.join(subdir, "css"), ignore_errors=True) shutil.copytree("app/css", os.path.join(subdir, "css")) shutil.rmtree(os.path.join(subdir, "js"), ignore_errors=True) shutil.copytree("app/js", os.path.join(subdir, "js")) except: print "Error copying supporting files for html." # Load roistore.js database for rendering template = "" with open(os.path.join('app', 'js', 'database-template.js'), "r") as fconv: template = fconv.read() context = {} context['image_items'] = entry_list context['table'] = base_dir_name # render the javascript page and save to disk page = pystache.render(template, context) with open(os.path.join(subdir, 'js', 'database.js'), "w") as fconv: fconv.write(page) print "Done."
def calculate_stats(generator, args, anchor_params): """ Calculates stats for anchor coverage over given dataset. Output stats include: - Average number of positive & negative anchors per image - Max/min number of positive anchors in dataset - Proportion of positive to negative anchors across dataset """ annotations_count = [] missed_annotations_count = [] positive_anchors_count = [] negative_anchors_count = [] num_images = generator.size() image_scale = None image_shape = None print("\n") for i in range(num_images): print("Processing {}/{} ".format(i, num_images), end="\r") annotations = generator.load_annotations(i) # Skip if there is no annotation label if len(annotations['labels']) == 0: continue # Resize the image and annotations # Save the relevant image properties (scale and shape) once and reuse - as we know that all images will have same properties # Saving these properties significantly speeds up the process if args.resize: if (image_scale is None): image = generator.load_image(i) image, image_scale = generator.resize_image(image) image_shape = image.shape annotations['bboxes'] *= image_scale else: if (image_shape is None): image = generator.load_image(i) image_shape = image.shape anchors = anchors_for_shape(image_shape, anchor_params=anchor_params) positive_indices, _, _, max_indices = compute_gt_annotations_for_visualisation( anchors, annotations['bboxes'], negative_overlap=args.negative_overlap_iou, positive_overlap=args.positive_overlap_iou) num_annotations = annotations['bboxes'].shape[0] missed_annotations = num_annotations - len( set(max_indices[positive_indices])) num_positive_anchors = annotations['bboxes'][ max_indices[positive_indices], :].shape[0] annotations_count.append(num_annotations) missed_annotations_count.append(missed_annotations) positive_anchors_count.append(num_positive_anchors) negative_anchors_count.append(anchors.shape[0] - num_positive_anchors) prop = sum(positive_anchors_count) / sum(negative_anchors_count) missed_annotations_stats = stats.describe(missed_annotations_count) positive_anchors_stats = stats.describe(positive_anchors_count) negative_anchors_stats = stats.describe(negative_anchors_count) print("##############################") print( f"\nResults for parameters:\nPositive IoU: {args.positive_overlap_iou}\nNegative IoU: {args.negative_overlap_iou}" ) print( f"\nAnchor parameters: \nsizes: {anchor_params.sizes}\nstrides: {anchor_params.strides}\nratios: {anchor_params.ratios}\nscales: {anchor_params.scales}" ) print("\n-------") print(f"\nTotal annotations: {sum(annotations_count)}") print( f"\nMissed annotations: \nMin, Max: {missed_annotations_stats.minmax} \nMean: {missed_annotations_stats.mean:.3f}" ) print(f"\nProportion of pos/neg anchors: {prop:.5f}") print( f"\nPositive anchors: \nMin, Max: {positive_anchors_stats.minmax}\nMean: {positive_anchors_stats.mean:.3f}" ) print( f"\nNegative anchors: \nMin, Max: {negative_anchors_stats.minmax}\nMean: {negative_anchors_stats.mean:.3f}" ) print("\n")
spixels = [] for i in range(0, 13): # for every pixel: for j in range(0, 13): spixels.append(pixels[i + 200, j + 200]) print('first sample:', fpixels) print('second sample:', spixels) averagef = grades_average(fpixels) averages = grades_average(spixels) skewf = skew(fpixels) skews = skew(spixels) nf, min_maxf, meanf, varf, skewf, kurtf = stats.describe(fpixels) ns, min_maxs, means, vars, skews, kurts = stats.describe(spixels) print('meanf:', meanf) print('means:', means) print('varf:', varf) print('vars:', vars) print('skewf:', skewf) print('skews:', skews) print('kurtf:', kurtf) print('kurts:', kurts) #variancef = grades_variance(fpixels, averagef) #variances = grades_variance(spixels, averages) ax = pl.subplot(111) #ax.bar(2, meanf, width=1) #ax.bar(4, means, width=1)
nvar.myhist(risk_factors["CP1"].diff().dropna()) nvar.myhist(risk_factors["CP2"].diff().dropna()) nvar.myhist(risk_factors["CP3"].diff().dropna()) # the correlation matrix of daily change in CP1\CP2\CP3 risk_factors.diff().dropna().corr() # Covariance matrix of daily change in CP1,CP2,CP3 risk_factors.diff().dropna().cov() # Cholesky decompostion C = linalg.cholesky(risk_factors.diff().dropna().cov()) import scipy.stats as stats print( "means for daily changes in 3 componets:\t", stats.describe(risk_factors.diff().dropna()).mean, ) print( "variance for daily changes 3 componets:\t", stats.describe(risk_factors.diff().dropna()).variance, ) # Use montecarlo simulation to analyze the daily Var # Mainly use the first 3 risk-factors def myvar(C, dstd, components, num_of_sim=10000): # COV=C*C' # dstd is the standard deviation of the real ex quots # num_of_sim is the number of simulation # componetns is the first 3 principle components of normalized ex quots result = pd.DataFrame(columns=quots_dropna.columns)
def main(): loc_mention_embeddings = "/Users/elliotschumacher/Dropbox/git/synonym_detection/resources/bilm/out_max/mention_embeddings" loc_concept_embeddings = "/Users/elliotschumacher/Dropbox/git/synonym_detection/resources/bilm/out_max/embedding_output" dev_file = "/Users/elliotschumacher/Dropbox/concept/share_clef/SPLIT_2017-12-08-13-38-01/train/dev_fix_concrete.tar" test_dict = {} for (comm, filename) in file_io.CommunicationReader(dev_file): for menset in comm.entityMentionSetList[0].mentionList: test_dict[menset.uuid.uuidString] = menset with open(os.path.join(loc_mention_embeddings, 'mention_representations.npy'), 'rb') as mention_representations_npy, \ open(os.path.join(loc_mention_embeddings, 'mention_to_info.pkl'), 'rb') as mention_to_info_pkl, \ open(os.path.join(loc_mention_embeddings, 'id_to_mention_info.pkl'), 'rb') as id_to_mention_info_pkl: mention_representations = np.load(mention_representations_npy) id_to_mention_info = pickle.load(id_to_mention_info_pkl) mention_to_info = pickle.load(mention_to_info_pkl) with open(os.path.join(loc_concept_embeddings, 'concept_representations.npy'), 'rb') as concept_representations_npy, \ open(os.path.join(loc_concept_embeddings, 'id_to_concept_name_alt.pkl'), 'rb') as id_to_concept_name_alt_pkl, \ open(os.path.join(loc_concept_embeddings, 'concept_to_id_name_alt.pkl'), 'rb') as concept_to_id_name_alt_pkl: concept_representations = np.load(concept_representations_npy) id_to_concept_info = pickle.load(id_to_concept_name_alt_pkl) cui_to_concept_info = pickle.load(concept_to_id_name_alt_pkl) output_file = "elmo_exp.csv" result_list = [] input_csv = "/Users/elliotschumacher/Dropbox/git/concept-linker/results/run_2019_03_06_11_01_30_b13/eval_759.csv" eval_csv = pd.DataFrame.from_csv(input_csv) cos_sims = {} shuffled_keys = list(mention_to_info.keys()) for mention_uuid1 in list(mention_to_info.keys()): if mention_uuid1 in test_dict: menset = test_dict[mention_uuid1] if menset.entityType in cui_to_concept_info: random.shuffle(shuffled_keys) for i in range(10): mention_uuid2 = shuffled_keys[i] if mention_uuid1 != mention_uuid2: m_indx1 = mention_to_info[mention_uuid1]["index"] m_indx2 = mention_to_info[mention_uuid2]["index"] cos_sim = cosine_similarity( [mention_representations[m_indx1, :]], [mention_representations[m_indx2, :]])[0][0] min_uuid = min(mention_uuid1, mention_uuid2) max_uuid = max(mention_uuid1, mention_uuid2) cos_sims[min_uuid, max_uuid] = cos_sim print("Stats for mention cos similarity") print(describe(list(cos_sims.values()))) outer_concept_list = list(cui_to_concept_info) inner_concept_list = list(cui_to_concept_info) random.shuffle(outer_concept_list) cos_sims_cui = {} for cui1 in outer_concept_list[:1000]: c_indx1 = cui_to_concept_info[cui1][0]["index"] c_indexes = random.sample(range(0, len(inner_concept_list)), 10) for cui2_indx in c_indexes: cui2 = inner_concept_list[cui2_indx] c_indx2 = cui_to_concept_info[cui2][0]["index"] if c_indx1 != c_indx2: cos_sim = \ cosine_similarity([concept_representations[c_indx1, :]], [concept_representations[c_indx2, :]])[0][0] min_uuid = min(cui1, cui2) max_uuid = max(cui1, cui2) cos_sims_cui[min_uuid, max_uuid] = cos_sim print("Stats for concept cos similarity") print(describe(list(cos_sims_cui.values()))) #df = pd.DataFrame([list(cos_sims.keys()), list(cos_sims_cui.keys())], columns=['Mention', 'Concept']) plt.hist([list(cos_sims.values()), list(cos_sims_cui.values())], color=['r', 'b'], alpha=0.5) plt.gca().legend(('Mentions', 'Concepts')) plt.show() for _, row in eval_csv.iterrows(): 
menset = test_dict[row["~~mention_uuid"]] if menset.entityType in cui_to_concept_info: mention_info = mention_to_info[menset.uuid.uuidString] concept_info = cui_to_concept_info[menset.entityType][0] m_indx = mention_info["index"] c_indx = mention_info["index"] sentence = " ".join([ w.text.strip() for w in menset.tokens.tokenization.tokenList.tokenList ]) m_rep = mention_representations[m_indx, :] c_rep = concept_representations[c_indx, :] cos_dist = cdist(concept_representations, m_rep.reshape(1, -1), metric='cosine') ranking = st.rankdata(cos_dist) gold_rank = ranking[c_indx] cos_sim = cosine_similarity([m_rep], [c_rep])[0][0] print("Cosine dist:{0}, sim:{1}".format(cos_dist[c_indx], cos_sim)) row["cos_dist"] = cos_sim row["sentence"] = sentence row["cos_rank"] = gold_rank result_list.append(row) dataframe = pd.DataFrame.from_records(result_list) dataframe.to_csv(output_file, index=False)
""" # %% # построение гистограмм с выводом описательной характеристики quantitative_variables = [ 'budget', 'revenue', 'runtime', 'vote_average', 'release_year' ] for variable in quantitative_variables: plt.hist(data[variable], 12, density=1, facecolor='c') plt.grid(True) plt.xlabel("Значения") plt.ylabel("Относительная частота") plt.title(f'Распределение по {variable}') plt.savefig(f'{variable}.png', bbox_inches='tight') plt.show() print(sp.describe(data[variable], ddof=1, bias=False)) print(data[variable].describe()) # %% # избавляемся от "|" в столбцах director, cast, genres data2 = data data2['director'] = data.director.apply(lambda x: str(x).split('|')) data2['cast'] = data.cast.apply(lambda x: str(x).split('|')) data2['genres'] = data.genres.apply(lambda x: str(x).split('|')) data3 = data2.explode('director') data4 = data3.explode('cast') data5 = data4.explode('genres') data5 # %% # построение гистограммы распредления жанров
for line in codecs.open("u.data","r",encoding="latin-1"): user,movie,rating,date=line.strip().split("\t") user_index=int(user)-1 movie_index=int(movie)-1 R[user_index,movie_index]=float(rating) print(R[0,10]) #%% 12-3 from scipy import stats user_mean_li=[] for i in range(0,R.shape[0]): user_rating=[x for x in R[i] if x>0.0] user_mean_li.append(stats.describe(user_rating).mean) stats.describe(user_mean_li) #%% import matplotlib.pyplot as plt #plt.plot(user_info_li) #plt.plot(movie_info_li) #plt.plot(user_mean_li) #plt.plot(R) #행렬시각화...? print(R.shape) #(943, 1682) print(R.shape[0]) #943 print(R.shape[1]) #1682 print(R[0,2]) print(R[0])
def test_stellar_structure_equations(
        file_name="../Example Stars/low_mass_star.txt",
        config=StellarConfiguration()):
    data = np.loadtxt(file_name).T * example_star_units[:, None]

    diff = config.T_prime_radiative(data[ex_r_index, :], data[ex_rho_index, :],
                                    data[ex_T_index, :], data[ex_L_index, :]) - \
        config.T_prime_convective(data[ex_r_index, :], data[ex_rho_index, :],
                                  data[ex_T_index, :], data[ex_M_index, :])

    rho_prime_actual = config.rho_prime(data[ex_r_index, :], data[ex_rho_index, :],
                                        data[ex_T_index, :], data[ex_M_index, :],
                                        data[ex_L_index, :])
    rho_prime_expected = data[ex_rho_prime_index, :]
    print("Rho Prime Percentage Error:",
          stats.describe((rho_prime_actual - rho_prime_expected) / rho_prime_expected))

    T_prime_actual = config.T_prime(data[ex_r_index, :], data[ex_rho_index, :],
                                    data[ex_T_index, :], data[ex_M_index, :],
                                    data[ex_L_index, :])
    T_prime_expected = data[ex_T_prime_index, :]
    print("T Prime Percentage Error:",
          stats.describe((T_prime_actual - T_prime_expected) / T_prime_expected))

    M_prime_actual = config.M_prime(data[ex_r_index, :], data[ex_rho_index, :])
    M_prime_expected = data[ex_M_prime_index, :]
    print("M Prime Percentage Error:",
          stats.describe((M_prime_actual - M_prime_expected) / M_prime_expected))

    L_prime_actual = config.L_prime(data[ex_r_index, :], data[ex_rho_index, :],
                                    data[ex_T_index, :])
    L_prime_expected = data[ex_L_prime_index, :]
    print("L Prime Percentage Error:",
          stats.describe((L_prime_actual - L_prime_expected) / L_prime_expected))

    P_actual = config.P(data[ex_rho_index, :], data[ex_T_index, :])
    P_expected = data[ex_P_index, :]
    print("P Percentage Error:",
          stats.describe((P_actual - P_expected) / P_expected))

    P_degeneracy_actual = config.P_degeneracy(data[ex_rho_index, :])
    P_degeneracy_expected = data[ex_P_degeneracy_index, :]
    print("P_degeneracy Percentage Error:",
          stats.describe((P_degeneracy_actual - P_degeneracy_expected) / P_degeneracy_expected))

    P_gas_actual = config.P_gas(data[ex_rho_index, :], data[ex_T_index, :])
    P_gas_expected = data[ex_P_gas_index, :]
    print("P_gas Percentage Error:",
          stats.describe((P_gas_actual - P_gas_expected) / P_gas_expected))

    kappa_actual = config.kappa(data[ex_rho_index, :], data[ex_T_index, :])
    kappa_expected = data[ex_kappa_index, :]
    print("Kappa Percentage Error:",
          stats.describe((kappa_actual - kappa_expected) / kappa_expected))
    model = Word2Vec(sentences=corpus, size=size, window=window, min_count=1,
                     workers=multiprocessing.cpu_count(), sg=0)
    return model

#%% Hyperparameter optimization
total_word_num(data)
num_word_by_document = []
for i in range(len(data)):
    num_word_by_document.append(len(data[i]))

from scipy import stats
word_stats = stats.describe(num_word_by_document)  # avoid rebinding the name of the stats module
word_stats.mean
np.sqrt(word_stats.variance)

size = [100, 200, 300, 400, 500]
window = [5, 6, 7, 8, 9, 10]

# Getting the closest word to the keyword
print("Word2Vec model training")
words_by_window = []
for w in window:
    words_by_size = []
    for s in size:
        model = model_w2v(data, s, w)
        words_by_size.append(model.wv.most_similar(keyword, topn=10))
        print(
            "Central words to the keyword (hidden layer: {}, window: {}):\n{}\n"
def topological_features(floormap, prepare_for_doom=False): """ Create the level graph from the floormap and compute some topological features on the graph. :param floormap: :param prepare_for_doom: (Default:False) If true each node will also contain vertices and walls information for converting the level to a WAD file. :return: (room map, room_graph, dict of metrics) """ roommap, room_graph, dist = create_graph(floormap, return_dist=True, room_coordinates=prepare_for_doom) room_props = regionprops(roommap) for r in range(1, roommap.max() + 1): # Room Size room_graph.node[r]["area"] = room_props[r - 1]["area"] room_graph.node[r]["perimeter"] = room_props[r - 1]["perimeter"] mask = (roommap == r) max_dist = np.max(mask * dist) room_graph.node[r]["max_dist"] = max_dist room_graph.node[r]["centroid"] = room_props[r - 1]["centroid"] # TODO: Add information about other maps, such as enemies, etc. centroid_distance = dict() for i, j in room_graph.edges(): # Decorate the edges with the distance if i == 0 or j == 0: continue centroid_distance[(i, j)] = np.linalg.norm( np.asarray(room_graph.node[i]["centroid"]) - np.asarray(room_graph.node[j]["centroid"])).item() nx.set_edge_attributes(room_graph, name='centroid_distance', values=centroid_distance) # To compute correct metrics we need to remove node 0, which is the background graph_no_background = room_graph.copy() graph_no_background.remove_node(0) metrics = dict() # Computing metrics from "Predicting the Global Structure of Indoor Environments: A costructive Machine Learning Approach", (Luperto, Amigoni, 2018) ##### metrics["nodes"] = len(nx.nodes(graph_no_background)) pl_list = list() diam_list = list() assort_list = list() for cc in nx.connected_component_subgraphs(graph_no_background): if len(cc.edges()) > 0: pl_list += [nx.average_shortest_path_length(cc)] diam_list += [nx.diameter(cc)] assort_list += [ nx.degree_assortativity_coefficient(graph_no_background) ] metrics["avg-path-length"] = np.mean(pl_list) if len(pl_list) > 0 else 0 metrics["diameter-mean"] = np.mean(diam_list) if len(diam_list) > 0 else 0 metrics["art-points"] = len( list(nx.articulation_points(graph_no_background))) metrics["assortativity-mean"] = nx.degree_assortativity_coefficient( graph_no_background) if len(cc.edges()) > 0 else 0 try: # Centrality measures metrics["betw-cen"] = nx.betweenness_centrality(graph_no_background) metrics["closn-cen"] = nx.closeness_centrality(graph_no_background) # These metrics may throw exceptions # metrics["eig-cen"] = nx.eigenvector_centrality_numpy(graph_no_background) # metrics["katz-cen"] = nx.katz_centrality_numpy(graph_no_background) # Describing node stat distributions and removing them from the dict for met in ['betw-cen', 'closn-cen']: values = list(metrics['{}'.format(met)].values()) st = describe(values) metrics["{}-min".format(met)] = st.minmax[0] metrics["{}-max".format(met)] = st.minmax[1] metrics["{}-mean".format(met)] = st.mean metrics["{}-var".format(met)] = st.variance metrics["{}-skew".format(met)] = st.skewness metrics["{}-kurt".format(met)] = st.kurtosis # Quartiles metrics["{}-Q1".format(met)] = np.percentile(values, 25) metrics["{}-Q2".format(met)] = np.percentile(values, 50) metrics["{}-Q3".format(met)] = np.percentile(values, 75) del metrics[met] except: warnings.warn("Unable to compute centrality for this level") metrics["betw-cen"] = np.nan metrics["closn-cen"] = np.nan ##### # Metrics on distance map. 
Ignoring black space surrounding the level cleandist = np.where(dist == 0, np.nan, dist) dstat = describe(cleandist, axis=None, nan_policy='omit') metrics["distmap-max".format(met)] = dstat.minmax[1] metrics["distmap-mean".format(met)] = dstat.mean metrics["distmap-var".format(met)] = dstat.variance metrics["distmap-skew".format(met)] = dstat.skewness metrics["distmap-kurt".format(met)] = dstat.kurtosis # Quartiles metrics["distmap-Q1".format(met)] = np.percentile(values, 25) metrics["distmap-Q2".format(met)] = np.percentile(values, 50) metrics["distmap-Q3".format(met)] = np.percentile(values, 75) return roommap, room_graph, metrics
def merge_text_events_with_timeseries(problem_type, data, text_reader, w2i_lookup, conf_max_len, dump_information=False, fname=None): text_not_found = 0 sucessful = 0 text_event_lens = [] data_with_text = [] if dump_information: text_count_by_hour = {} patient_count_by_hour = {} text_len_by_hour = {} maximum_index_output = -1 for batch in data: ip, op, _ = batch['data'] X = ip[0] mask = ip[2] if problem_type == 'decom': ts = batch['decomp_ts'] output = op[1] elif problem_type == 'los': ts = batch['los_ts'] output = op[2] maximum_index_output = max(maximum_index_output, output.max()) assert_shapes(X, mask, output) text_event_dictionary = text_reader.read_all_text_events_json( batch['names']) max_len = -1 for i, name in enumerate(batch['names']): if name not in text_event_dictionary: continue text_events = text_event_dictionary[name] hours = map(lambda x: x[0], text_events) hours = list(filter(lambda h: h <= X.shape[1], hours)) max_len = max(max_len, len(hours)) final_items = [] for i, name in enumerate(batch['names']): # timerow represents 1 patient. # first timestep is 5. if name not in text_event_dictionary: text_not_found += 1 continue else: sucessful += 1 # if sucessful % 5000 == 0: # print("Scccessful:", sucessful) mask_i = mask[i] X_i = X[i] output_i = output[i] ts_i = ts[i] if len(ts_i) == 0: continue text_events = text_event_dictionary[name] assert len(text_events[0]) == 2 hours = list(map(lambda x: x[0], text_events))[:max_len] texts = list(map(lambda x: x[1], text_events))[:max_len] if dump_information: assert fname is not None count = 0 length = 0 for t in ts_i: if t in patient_count_by_hour: patient_count_by_hour[t] += 1 else: patient_count_by_hour[t] = 1 if t in hours: count += 1 length += len(texts[hours.index(t)]) if t not in text_count_by_hour: text_count_by_hour[t] = 0 text_len_by_hour[t] = 0 text_count_by_hour[t] += count text_len_by_hour[t] += length assert len(hours) == len(texts) text_event_lens.append(len(texts)) # generate 2D TimeMask for 1DConvolution. time_mask = np.zeros((mask_i.shape[0], max_len)) if max(ts_i) >= mask_i.shape[0]: ts_i = [ti for ti in ts_i if ti < mask_i.shape[0]] for t in ts_i: for ind, h in enumerate(hours): if h > t: break time_mask[t][ind] = t-h+1 assert time_mask[t][ind] >= 0 final_items.append( {'X': X_i, 'Out': output_i, 'Mask': mask_i, 'Text': texts, 'TimeMask': time_mask}) if len(final_items) >= 1: # Now post process. X = np.stack(list(map(lambda x: x['X'], final_items))) Output = np.stack(list(map(lambda x: x['Out'], final_items))) Mask = np.stack(list(map(lambda x: x['Mask'], final_items))) TimeMask = np.stack( list(map(lambda x: x['TimeMask'], final_items))) Texts, _ = generate_tensor_text( list(map(lambda x: x['Text'], final_items)), w2i_lookup, conf_max_len) try: assert_shapes(X, Mask, Output, TimeMask, Texts) data_with_text.append( {'X': X, 'Output': Output, 'Mask': Mask, 'TimeMask': TimeMask, 'Texts': Texts}) except: print("Merge failed due to shape issue") print("Text Not found for patients: ", text_not_found) print("Sucessful for patients: ", sucessful) print("Maximum value in Output: ", maximum_index_output) text_event_lens = np.array(text_event_lens) from scipy import stats print(stats.describe(text_event_lens)) if dump_information: with open(fname, 'wb') as f: pickle.dump({'text_count_by_hour': text_count_by_hour, 'patient_count_by_hour': patient_count_by_hour, 'text_lens_by_hour': text_len_by_hour}, f, pickle.HIGHEST_PROTOCOL) return data_with_text, text_event_lens
def runAnalysis(trainFilename, testFilename, labelFilename, labelCol, labelName, trainYr=1999, testYr=2009, grams=(2, 5), addWrdCnt=False, addCntry=False): # Incorporate gram specific path if grams[1] == grams[0]: gramDir = 'grams' + str(grams[1]) if grams[1] != grams[0]: gramDir = 'grams' + str(grams[0]) + '_' + str(grams[1]) ### # Load data trainData = buildData(textFile=trainFilename, sYr=trainYr, labelFile=labelFilename) testData = buildData(textFile=testFilename, sYr=testYr, labelFile=labelFilename) #### # Divide into train and test and convert # to appropriate format vectorizer = TfidfVectorizer(ngram_range=grams) xTrain = vectorizer.fit_transform(trainData[:, 1]) yTrain = np.array([int(x) for x in list(trainData[:, labelCol])]) print('Saving tfidf') vec_file = '{}_tfidf.pkl'.format(labelName) joblib.dump(vectorizer, vec_file) xTest = vectorizer.transform(testData[:, 1]) yTest = np.array([int(x) for x in list(testData[:, labelCol])]) # Add other features if (addWrdCnt): wTrain = csr_matrix(np.array(list(trainData[:, 2]))).transpose() wTest = csr_matrix(np.array(list(testData[:, 2]))).transpose() xTrain = hstack((xTrain, wTrain)) xTest = hstack((xTest, wTest)) if (addCntry): cntryYr = [x.split('_')[0] for x in trainData[:, 0]] from pandas import factorize cntryYr = factorize(cntryYr)[0] cTrain = csr_matrix(np.array(list(cntryYr))).transpose() cntryYr = [x.split('_')[0] for x in testData[:, 0]] cntryYr = factorize(cntryYr)[0] cTest = csr_matrix(np.array(list(cntryYr))).transpose() xTrain = hstack((xTrain, cTrain)) xTest = hstack((xTest, cTest)) ##### # Run SVM with linear kernel print('Fitting SVM') svmClass = LinearSVC().fit(xTrain, yTrain) yConfSVM = list(svmClass.decision_function(xTest)) yPredSVM = svmClass.predict(xTest) svm_class_file = '{}_svm_class.pkl'.format(labelName) joblib.dump(svmClass, svm_class_file) print('SVM2') svmClass_2 = SVC(kernel='linear', probability=True).fit(xTrain, yTrain) yProbSVM = svmClass_2.predict_proba(xTest) svm_class2_file = '{}_svm_class2.pkl'.format(labelName) joblib.dump(svmClass_2, svm_class2_file) ##### # Performance stats outpath = _get_data('../../results', gramDir) if addWrdCnt: outName = (labelName + '_train' + trainFilename.split('_')[1] + '_test' + testFilename.split('_')[1] + '_xtraFt' + '.txt') outName = os.path.join(outpath, outName) else: outName = (labelName + '_train' + trainFilename.split('_')[1] + '_test' + testFilename.split('_')[1] + '.txt') outName = os.path.join(outpath, outName) orig_stdout = sys.stdout out = open(outName, 'w') sys.stdout = out print '\nTrain Data from: ' + trainFilename print '\t\tTrain Data Cases: ' + str(xTrain.shape[0]) print '\t\tMean of y in train: ' + str(round(describe(yTrain)[2], 3)) + '\n' print 'Test Data from: ' + testFilename print '\t\tTest Data Cases: ' + str(xTest.shape[0]) print '\t\tMean of y in test: ' + str(round(describe(yTest)[2], 3)) + '\n' prStats('SVM', grams, yTest, yPredSVM) out.close() sys.stdout = orig_stdout ##### # Print data with prediction trainCntry = np.array([[x.split('_')[0].replace(',', '')] for x in list(trainData[:, 0])]) trainYr = np.array([[x.split('_')[1]] for x in list(trainData[:, 0])]) testCntry = np.array([[x.split('_')[0].replace(',', '')] for x in list(testData[:, 0])]) testYr = np.array([[x.split('_')[1]] for x in list(testData[:, 0])]) vDat = np.array( [[x] for x in flatten([['train'] * trainData.shape[0], ['test'] * testData.shape[0]])]) trainLab = np.array([[x] for x in list(trainData[:, labelCol])]) testLab = np.array([[x] for x in list(testData[:, 
labelCol])]) if labelName[0:6] == 'polCat': probSVM = [';'.join(['%s' % x for x in row]) for row in yProbSVM] confSVM = [ ';'.join(['%s' % x for x in sublist]) for sublist in yConfSVM ] if labelName[0:6] != 'polCat': probSVM = [x[1] for x in yProbSVM] confSVM = yConfSVM filler = [-9999] * trainData.shape[0] predSVM = np.array([[x] for x in flatten([filler, list(yPredSVM)])]) probSVM = np.array([[x] for x in flatten([filler, probSVM])]) confSVM = np.array([[x] for x in flatten([filler, confSVM])]) output = np.hstack((np.vstack( (trainCntry, testCntry)), np.vstack( (trainYr, testYr)), vDat, np.vstack( (trainLab, testLab)), np.hstack((confSVM, probSVM, predSVM)))) outCSV = outName.replace('.txt', '.csv') outCSV = os.path.join(outpath, outCSV) with open(outCSV, 'wb') as f: f.write(b'country,year,data,' + labelName + ',confSVM,probSVM,predSVM\n') np.savetxt(f, output, delimiter=',', fmt="%s") # Print top features for classes from SVM infFeatures(outpath, outName.replace('.txt', '._wrdFtr.csv'), vectorizer, svmClass, 500)
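# Stripped of the feature engineering and file I/O, the classification step above is
# a standard TF-IDF plus linear-SVM pipeline: LinearSVC for fast decision scores and a
# probability-calibrated SVC(kernel='linear') for class probabilities. A minimal sketch
# under that reading; the toy texts and labels here are made up for illustration.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from scipy.stats import describe

train_texts = ["economy grows", "markets rally", "gdp rises", "trade expands", "growth continues",
               "protest erupts", "riots spread", "unrest grows", "clashes reported", "strike begins"]
y_train = np.array([0] * 5 + [1] * 5)
test_texts = ["economy rally", "protest spreads"]

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
x_train = vectorizer.fit_transform(train_texts)
x_test = vectorizer.transform(test_texts)

svm = LinearSVC().fit(x_train, y_train)                                   # decision scores only
svm_prob = SVC(kernel='linear', probability=True).fit(x_train, y_train)   # Platt-scaled probabilities

print("mean of y in train:", round(describe(y_train)[2], 3))
print("decision scores:", svm.decision_function(x_test))
print("probabilities:", svm_prob.predict_proba(x_test))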
def doStats(warmupdata, Data, doGraphs=False, doWriteStdout=False, graphFilenameStub=''): # Mean, min, max, variance (nsamples, (min, max), mean, unbiasedvar, skew, kurtosis) = stat.describe( Data ) # Unbiasedvar is actually the reduced-bias estimator of population variance (~1/(N-1)...) # Standard error of mean: sem = stat.sem(Data) # Not yet correlation corrected # Compute the autocorrelation function: Data_dcshift = Data - mean #DataNorm=np.sum(np.square(Data_dcshift)) cor = np.correlate(Data_dcshift, Data_dcshift, mode='same') / unbiasedvar autocor = cor[int(cor.size / 2):] autocor = autocor / np.arange(nsamples - 1, nsamples - 1 - autocor.size, -1) # Note -1 for 0-based indexing # Choose where to cutoff the autocorrelation time sum cutoff = autocor.size j = 0 while j < cutoff: if autocor[j] < np.sqrt(2. / (nsamples - j)): cutoff = np.minimum(cutoff, 5 * j) j = j + 1 # Compute correlation time kappa = 1. + 2. * np.sum(autocor[1:int(2. * cutoff / 5.)]) # We can also make an array of all possible cutoffs if doGraphs: kappa_cutoffdep = np.ones(autocor.size) for jc in range(1, autocor.size): kappa_cutoffdep[jc] = 1 + 2 * np.sum(autocor[1:jc]) # Update the standard error of the mean for a correlation correction semcc = sem * np.sqrt(kappa) # Manual (non-Numpy) autocorrelation function for transparency - verified equal #j=0 #cutoff=nsamples #autocor_m=np.zeros(cutoff) #while j < cutoff: # autocor_m[j]=0. # for i in range(0,nsamples-j): # autocor_m[j] = autocor_m[j] + (Data[warmup+i]-mean)*(Data[warmup+i+j]-mean) # autocor_m[j]=autocor_m[j]/(unbiasedvar*(nsamples-j)) # if autocor_m[j] < np.sqrt(2./(nsamples-j)): # cutoff = np.minimum(cutoff,5*j) # j=j+1 if doWriteStdout == True: print(" - Mean = ", mean, " +/- ", semcc) print(" - Equilibrated samples = ", nsamples) print(" - Correlation time = ", kappa) print(" - Effective # samples = ", nsamples / kappa) print(" - Reduced-bias variance = ", unbiasedvar) # note that there is no unbiased estimator for the population standard deviation. We can use sqrt(var) as a indicative estimator. print( " - S.D. (unbiased, biased) = ", np.sqrt(unbiasedvar), np.std(Data, ddof=0) ) # ddof is correction to 1/N...using ddof=1 returns regular reduced-bias estimator print(" - Skewness = ", skew) print(" - Kurtosis = ", kurtosis) print(" - Min, Max = ", min, max) #print ( Reduced bias estimator - test vs. above from sqrt(var)) if doGraphs: import matplotlib.pyplot as pl # If we import pylab instead, we get matplotlib.pyplot and numpy both under the global namespace for MATLAB-like syntax and reduced typing (useful for interactive use) # Plot some things pl.figure(num=1, figsize=(15, 10)) # pl.subplot(221) # Select first panel in 2x2 grid... 
pl.title("Trace of Data") pl.plot(np.concatenate([warmupdata, Data])) pl.ylim([0.98 * min, 1.02 * max]) pl.axhline(mean, color='red') pl.axvspan(0, len(warmupdata), color='green', alpha=0.5) pl.axvline(len(warmupdata), color='green') pl.xlabel("Sample index") # # Generate a histogram of the data # Not needed now - just do a histogram plot directly #hist=stat.histogram(Data) #print hist #hist=np.histogram(Data) #print hist[0] #print hist[1] #print type(hist[0]) pl.subplot(222) pl.title("Histogram of Data") n, bins, patches = pl.hist(Data, 25, normed=1, facecolor="green", alpha=0.5) #ygauss=stat.norm(bins,mean,np.sqrt(unbiasedvar)) pl.plot(bins, stat.norm.pdf(bins, mean, np.sqrt(unbiasedvar)), 'r--') # pl.subplot(223) pl.plot(autocor[:cutoff]) x = np.arange(0, cutoff) pl.plot(x, np.exp(-x / kappa)) pl.title("Autocorrelation function") pl.xlabel('$\\tau$') pl.ylabel('$C\\left(\\tau\\right)$') pl.axhline(0, color='black') #pl.xlim(0,plotxmax) # pl.subplot(224) pl.plot(kappa_cutoffdep) pl.title("Correlation time estimator vs. cutoff") #pl.xlabel('$\\tau_{cut}$') #pl.ylabel('$\\Kappa$') #pl.axhline(0,color='black') # pl.savefig("stats_{0}.png".format(graphFilenameStub)) return (nsamples, (min, max), mean, semcc, kappa, unbiasedvar, autocor)
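# The core of doStats is the correlation-time correction: the naive standard error of
# the mean is inflated by sqrt(kappa), where kappa = 1 + 2 * sum of the normalized
# autocorrelation function up to a cutoff. A compact, self-contained sketch of the same
# estimator on a synthetic AR(1) series, assuming the same cutoff heuristic as above.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)

# synthetic correlated data: AR(1) with coefficient phi
phi, n = 0.9, 20000
x = np.zeros(n)
for i in range(1, n):
    x[i] = phi * x[i - 1] + rng.normal()

nsamples, (dmin, dmax), mean, var, skew, kurt = stats.describe(x)
sem = stats.sem(x)  # naive standard error, assumes independent samples

# normalized autocorrelation function (same construction as in doStats)
dc = x - mean
cor = np.correlate(dc, dc, mode='same') / var
autocor = cor[cor.size // 2:]
autocor = autocor / np.arange(nsamples - 1, nsamples - 1 - autocor.size, -1)

# cutoff heuristic: stop once the ACF drops below its noise level
cutoff = autocor.size
for j in range(autocor.size):
    if j >= cutoff:
        break
    if autocor[j] < np.sqrt(2.0 / (nsamples - j)):
        cutoff = min(cutoff, 5 * j)

kappa = 1.0 + 2.0 * np.sum(autocor[1:int(2.0 * cutoff / 5.0)])
semcc = sem * np.sqrt(kappa)
print(f"kappa ~ {kappa:.1f} (theory for AR(1): {(1 + phi) / (1 - phi):.1f})")
print(f"naive SEM {sem:.4f} vs correlation-corrected SEM {semcc:.4f}")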
start_t = time()
for theta in np.linspace(0, tau, RESOLUTION)[1053:]:
    new_circle = rotate_list_of_points(circle, theta)
    sextant = ary(circle_to_sextant(new_circle))
    '''
    # plot the initial state
    plt.scatter(sextant[:, 0], sextant[:, 1])
    plt.title("Initial distribution")
    plt.show()
    '''
    all_points = [Point(p) for p in sextant]
    step = 0
    force_mag = [quadrature(p.get_force(all_points)) for p in all_points]
    des = describe(force_mag)
    # For F_LIMIT = .5% of radius, this should take a bit more than 200 iterations.
    while des.minmax[1] > F_LIMIT and des.mean > F_LIMIT / 2.5:
        forces = ary([p.get_force(all_points) for p in all_points])
        force_mag = [quadrature(f) for f in forces]
        '''
        plt.hist(force_mag, bins=200)
        sns.kdeplot(force_mag)
        plt.title("forces magnitudes at step " + str(step))
        plt.show()
        '''
        # get the statistical information about the forces' magnitudes.
        des = describe(force_mag)
        # calculate step size to take
        if des.minmax[1] < F_LIMIT * 15 and des.mean < F_LIMIT * 15 / 2.5:
# %%
j1.mode()

# %% [markdown]
# ## combinations, permutations, and exponentials
# ## scipy.special

# %%
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt

# %%
x = [1, 2, 3, 4, 5]
stats.describe(x)

# %%
normal = stats.norm()

# %%
x = np.arange(-3, 3.1, 0.1)

# %%
x

# %%
plt.plot(x, normal.pdf(x))
plt.show()

# %%
normal.expect()

# %%
normal.interval(0.95)

# %%
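# For a frozen standard normal, interval(0.95) returns the central interval holding 95%
# of the probability mass, i.e. roughly (-1.96, 1.96), which is the same quantile the
# ppf gives. A quick numerical check (a sketch; nothing here comes from the notebook above).
from scipy import stats

normal = stats.norm()                   # standard normal, loc=0, scale=1
lo, hi = normal.interval(0.95)
z = stats.norm.ppf(0.975)               # upper 97.5% quantile
print(lo, hi)                           # approximately -1.96, 1.96
print(z)                                # approximately 1.96
print(normal.cdf(hi) - normal.cdf(lo))  # 0.95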
def extractfeatures(self, DICOMImages, image_pos_pat, image_ori_pat, series_path, phases_series, VOI_mesh): """ Start pixVals for collection pixel values at VOI """ pixVals_margin = [] pixVals = [] Fmargin = {} voxel_frameS = {} # necessary to read point coords VOIPnt = [0, 0, 0] ijk = [0, 0, 0] pco = [0, 0, 0] for i in range(len(DICOMImages)): # find mapping to Dicom space [transformed_image, transform_cube ] = Display().dicomTransform(DICOMImages[i], image_pos_pat, image_ori_pat) if (i == 0): # create mask from segmenation np_VOI_mask = self.createMaskfromMesh(VOI_mesh, transformed_image) for j in range(VOI_mesh.GetNumberOfPoints()): VOI_mesh.GetPoint(j, VOIPnt) # extract pixID at location VOIPnt pixId = transformed_image.FindPoint(VOIPnt[0], VOIPnt[1], VOIPnt[2]) im_pt = [0, 0, 0] transformed_image.GetPoint(pixId, im_pt) inorout = transformed_image.ComputeStructuredCoordinates( im_pt, ijk, pco) if (inorout == 0): pass else: pixValx = transformed_image.GetScalarComponentAsFloat( ijk[0], ijk[1], ijk[2], 0) pixVals_margin.append(pixValx) # Now collect pixVals print "\n Saving %s" % 'Fmargin' + str(i) Fmargin['Fmargin' + str(i)] = pixVals_margin pixVals_margin = [] # extract pixID at inside VOIPnt VOI_scalars = transformed_image.GetPointData().GetScalars() np_VOI_imagedata = vtk_to_numpy(VOI_scalars) dims = transformed_image.GetDimensions() spacing = transformed_image.GetSpacing() np_VOI_imagedata = np_VOI_imagedata.reshape( dims[2], dims[1], dims[0]) np_VOI_imagedata = np_VOI_imagedata.transpose(2, 1, 0) #################### HERE GET INTERNAL PIXELS IT AND MASK IT OUT VOI_imagedata = np_VOI_imagedata[nonzero(np_VOI_mask)] for j in range(len(VOI_imagedata)): pixValx = VOI_imagedata[j] pixVals.append(pixValx) # Now collect pixVals print "\n Saving %s" % 'F' + str(i) voxel_frameS['F' + str(i)] = pixVals pixVals = [] ############################################################## # Initialize features self.i_var = [] self.alln_F_r_i = [] self.allmin_F_r_i = [] self.allmax_F_r_i = [] self.allmean_F_r_i = [] self.allvar_F_r_i = [] self.allskew_F_r_i = [] self.allkurt_F_r_i = [] F_r_0 = array(voxel_frameS['F' + str(0)]).astype(float) n, min_max, meanFr, var_F_r_0, skew, kurt = stats.describe(F_r_0) self.i_var_max = 0 # Collect to Compute inhomogeneity variance of uptake and other variables for k in range(1, len(DICOMImages)): F_r_i = array(voxel_frameS['F' + str(k)]).astype(float) print "\nF_r_i parameters %s" % str(k) n_F_r_i, min_max_F_r_i, mean_F_r_i, var_F_r_i, skew_F_r_i, kurt_F_r_i = stats.describe( F_r_i) print("Number of internal voxels: {0:d}".format(n_F_r_i)) self.alln_F_r_i.append(n_F_r_i) print("Minimum: {0:8.6f} Maximum: {1:8.6f}".format( min_max_F_r_i[0], min_max_F_r_i[1])) self.allmin_F_r_i.append(min_max_F_r_i[0]) self.allmax_F_r_i.append(min_max_F_r_i[1]) print("Mean: {0:8.6f}".format(mean_F_r_i)) self.allmean_F_r_i.append(mean_F_r_i) print("Variance F_r_i: {0:8.6f}".format(var_F_r_i)) self.allvar_F_r_i.append(var_F_r_i) print("Skew : {0:8.6f}".format(skew_F_r_i)) self.allskew_F_r_i.append(skew_F_r_i) print("Kurtosis: {0:8.6f}".format(kurt_F_r_i)) self.allkurt_F_r_i.append(kurt_F_r_i) print("Variance of uptake: {0:8.6f}".format(var_F_r_i / var_F_r_0)) self.i_var.append(var_F_r_i / var_F_r_0) # Find max of change in variance of uptake if (self.i_var[k - 1] > self.i_var_max): self.i_var_max = self.i_var[k - 1] print("\nMax Variance of uptake: {0:8.6f}\n".format(self.i_var_max)) # Collect to Compute change in variance of uptake self.ii_var = [] self.ii_var_min = 1000 for k in 
range(len(DICOMImages) - 1): F_r_i = array(voxel_frameS['F' + str(k)]).astype(float) F_r_iplus = array(voxel_frameS['F' + str(k + 1)]).astype(float) n, min_max, meanFr, var_F_r_ith, skew, kurt = stats.describe(F_r_i) n, min_max, meanFr, var_F_r_iplus, skew, kurt = stats.describe( F_r_iplus) """change Variance of uptake:""" self.ii_var.append(var_F_r_ith / var_F_r_iplus) # Find max of change in variance of uptake if (var_F_r_ith / var_F_r_iplus < self.ii_var_min): self.ii_var_min = var_F_r_ith / var_F_r_iplus print("Min change Variance of uptake: {0:8.6f}\n".format( self.ii_var_min)) # Extract features for sharpness of lesion margin, compute Margin gradient iii_var # The gradient is computed using convolution with a 3D sobel filter using scipy.ndimage.filters.sobel # The function generic_gradient_magnitude calculates a gradient magnitude using the function passed through derivative to calculate first derivatives. F_rmargin_0 = array(Fmargin['Fmargin' + str(0)]).astype(float) self.iii_var_max = -1000 iii_Sobelvar = [] # Collect to Compute variance of uptake and other variables for k in range(1, len(DICOMImages)): F_rmargin_i = array(Fmargin['Fmargin' + str(k)]).astype(float) margin_delta = F_rmargin_i - F_rmargin_0 # using first sobel and then prewitt sobel_grad_margin_delta = generic_gradient_magnitude( margin_delta, sobel) # compute feature Margin Gradient n, min_max, mean_sobel_grad_margin, var, skew, kurt = stats.describe( sobel_grad_margin_delta) n, min_max, mean_F_rmargin_i, var_F_r_ith, skew, kurt = stats.describe( F_rmargin_i) """Margin Gradient""" iii_Sobelvar.append(mean_sobel_grad_margin / mean_F_rmargin_i) # Find max of Margin Gradient if (iii_Sobelvar[k - 1] > self.iii_var_max): self.iii_var_max = iii_Sobelvar[k - 1] self.iii_var_max_k = k print("Max Margin Gradient: {0:8.6f}".format(self.iii_var_max)) print("k for Max Margin Gradient: {0:8.6f}".format(self.iii_var_max_k)) # compute iv feature Variance of Margin Gradient # note: only computed from the subtraction frames of i and 0 where the margin gradient iii_var is maximum. self.ivVariance = [] F_rmargin_iv = array(Fmargin['Fmargin' + str(self.iii_var_max_k)]).astype(float) n, min_max, mean_F_rmargin_iv, var_F_r_ith, skew, kurt = stats.describe( F_rmargin_iv) margin_delta_iv = F_rmargin_iv - F_rmargin_0 # using first sobel and then prewitt sobel_grad_margin_delta_iv = generic_gradient_magnitude( margin_delta_iv, sobel) n, min_max, mean_sobel, var_sobel_grad_margin_delta_iv, skew, kurt = stats.describe( sobel_grad_margin_delta_iv) self.ivVariance = var_sobel_grad_margin_delta_iv / mean_F_rmargin_iv**2 print("Variance of spatial Margin Gradient: {0:8.6f}".format( self.ivVariance)) # Extract Shape features: pre-requisite is the Volume and the diameter of the lesion #################################### # Measure VOI ################################### VOI_massProperty = vtk.vtkMassProperties() VOI_massProperty.SetInputData(VOI_mesh) VOI_massProperty.Update() # get VOI volume # VTK is unitless. The units you get out are the units you put in. # If your input polydata has points defined in terms of millimetres, then # the volume will be in cubic millimetres. 
VOI_vol = VOI_massProperty.GetVolume() # mm3 VOI_surface = VOI_massProperty.GetSurfaceArea() # mm2 # just print the results print "\nVolume lesion = ", VOI_vol print "Surface lesion = ", VOI_surface # Calculate the effective diameter of the surface D=2(sqrt3(3V/(4pi))) diam_root = (3 * VOI_vol) / (4 * pi) self.VOI_efect_diameter = 2 * pow(diam_root, 1.0 / 3) print "VOI_efect_diameter = ", self.VOI_efect_diameter centerOfMassFilter = vtk.vtkCenterOfMass() centerOfMassFilter.SetInputData(VOI_mesh) centerOfMassFilter.SetUseScalarsAsWeights(False) centerOfMassFilter.Update() # centroid of lesion self.lesion_centroid = [0, 0, 0] self.lesion_centroid = centerOfMassFilter.GetCenter() print "lesion_centroid = ", self.lesion_centroid # create a sphere to compute the volume of lesion within a sphere of effective diameter sphere_effectD = vtk.vtkSphereSource() sphere_effectD.SetRadius(self.VOI_efect_diameter / 2) #VOI_diameter/2 sphere_effectD.SetCenter(self.lesion_centroid) sphere_effectD.Update() # compute volume of lesion within a sphere of effective diameter sphereVOI_massProperty = vtk.vtkMassProperties() sphereVOI_massProperty.SetInputData(sphere_effectD.GetOutput()) sphereVOI_massProperty.Update() sphereVOI_vol = sphereVOI_massProperty.GetVolume() # mm3 # just print the results print "Volume sphereVOI = ", sphereVOI_vol # Compute Shape of lesion in 3D # Circularity epsilon = 0.001 self.circularity = sphereVOI_vol / (VOI_vol + epsilon) print("\nCircularity: {0:8.6f}".format(self.circularity)) self.irregularity = 1 - pi * (self.VOI_efect_diameter / VOI_surface) print("Irregularity: {0:8.6f}".format(self.irregularity)) #################################### # Radial gradient analysis ref[9] white paper ################################### # Radial gradient analysis is based on examination of the angles between voxel-value gradients # and lines intersecting a single point near the center of the suspect lesion, lines in radial directions. # Radial gradient values are given by the dot product of the gradient direction and the radial direction. 
RGH_mean = [] self.max_RGH_mean = 0 self.max_RGH_mean_k = 0 RGH_var = [] self.max_RGH_var = 0 self.max_RGH_var_k = 0 H_norm_p = [] # do subtraction of timepost-pre #################### for i in range(1, len(DICOMImages)): subtractedImage = Display().subImage(DICOMImages, i) [transformed_image, transform_cube ] = Display().dicomTransform(subtractedImage, image_pos_pat, image_ori_pat) for j in range(VOI_mesh.GetNumberOfPoints()): VOI_mesh.GetPoint(j, VOIPnt) r = array(VOIPnt) rc = array(self.lesion_centroid) norm_rdir = (r - rc) / linalg.norm(r - rc) # Find point for gradient vectors at the margin point pixId = transformed_image.FindPoint(VOIPnt[0], VOIPnt[1], VOIPnt[2]) sub_pt = [0, 0, 0] transformed_image.GetPoint(pixId, sub_pt) ijk = [0, 0, 0] pco = [0, 0, 0] grad_pt = [0, 0, 0] inorout = transformed_image.ComputeStructuredCoordinates( sub_pt, ijk, pco) if (inorout == 0): print "point outside data" else: transformed_image.GetPointGradient( ijk[0], ijk[1], ijk[2], transformed_image.GetPointData().GetScalars(), grad_pt) ############# # Compute vector in the direction gradient at margin point grad_marginpt = array([grad_pt]) norm_grad_marginpt = grad_marginpt / linalg.norm(grad_marginpt) # Compute dot product (unit vector for dot product) p_dot = dot(norm_grad_marginpt, norm_rdir) norm_p_dot = np.abs(p_dot)[0] #linalg.norm(p_dot) H_norm_p.append(norm_p_dot) # The histogram of radial gradient values quantifying the frequency of occurrence of the dot products in a given region of interest # radial gradient histogram. The hist() function now has a lot more options # first create a single histogram # the histogram of the data with histtype='step' # plt.figure() # nsamples, bins, patches = plt.hist(array(H_norm_p), 50, normed=1, histtype='bar',facecolor='blue', alpha=0.75) # n, min_max, mean_bins, var_bins, skew, kurt = stats.describe(nsamples) mean_bins = np.mean(H_norm_p) var_bins = np.var(H_norm_p) print("\n mean RGB: {0:8.6f}".format(mean_bins)) print("variance RGB: {0:8.6f}".format(var_bins)) # Append data RGH_mean.append(mean_bins) RGH_var.append(var_bins) # Find max of RGH Gradient if (RGH_mean[i - 1] > self.max_RGH_mean): self.max_RGH_mean = RGH_mean[i - 1] self.max_RGH_mean_k = i if (RGH_var[i - 1] > self.max_RGH_var): self.max_RGH_var = RGH_var[i - 1] self.max_RGH_var_k = i # add a line showing the expected distribution # create a histogram by providing the bin edges (unequally spaced) plt.xlabel('normalized dot product |R.G|') plt.ylabel('Probability') plt.title('radial gradient histogram') plt.grid(True) ################# Jacob's lesion margin sharpness #initializations VOI_outlinept_normal = [0, 0, 0] VOI_outlinept = [0, 0, 0] inpt = [0, 0, 0] outpt = [0, 0, 0] im_pts = [0, 0, 0] ijk_in = [0, 0, 0] ijk_out = [0, 0, 0] pco = [0, 0, 0] SIout_pixVal = [] lastSIout_pixVal = [] # get model_point_normals VOI_point_normals = vtk.vtkPolyDataNormals() VOI_point_normals.SetInputData(VOI_mesh) VOI_point_normals.SetComputePointNormals(1) VOI_point_normals.SetComputeCellNormals(0) VOI_point_normals.SplittingOff() VOI_point_normals.FlipNormalsOff() VOI_point_normals.ConsistencyOn() VOI_point_normals.Update() # Retrieve model normals VOI_normalsRetrieved = VOI_point_normals.GetOutput().GetPointData( ).GetNormals() VOI_n = VOI_normalsRetrieved.GetNumberOfTuples() # obtain vols of interest [transf_pre_dicomReader, transform_cube] = Display().dicomTransform(DICOMImages[0], image_pos_pat, image_ori_pat) [transf_last_dicomReader, transform_cube ] = Display().dicomTransform(DICOMImages[len(DICOMImages) - 
1], image_pos_pat, image_ori_pat) num_margin = [] den_margin = [] for i in range(1, len(DICOMImages)): #initializations SIout_pixVal = [] lastSIout_pixVal = [] subtractedImage = Display().subImage(DICOMImages, i) [transf_sub_pre_dicomReader, transform_cube ] = Display().dicomTransform(subtractedImage, image_pos_pat, image_ori_pat) for k in range(VOI_n): VOI_outlinept_normal = VOI_normalsRetrieved.GetTuple3(k) VOI_mesh.GetPoint(k, VOI_outlinept) # "d for radial lenght: %f" % d d = sqrt(spacing[0]**2 + spacing[1]**2 + spacing[2]**2) inpt[0] = VOI_outlinept[0] - VOI_outlinept_normal[0] * d inpt[1] = VOI_outlinept[1] - VOI_outlinept_normal[1] * d inpt[2] = VOI_outlinept[2] - VOI_outlinept_normal[2] * d outpt[0] = VOI_outlinept[0] + VOI_outlinept_normal[0] * d outpt[1] = VOI_outlinept[1] + VOI_outlinept_normal[1] * d outpt[2] = VOI_outlinept[2] + VOI_outlinept_normal[2] * d # get pre-contrast SIin to normalized RSIgroup [See equation 1] from paper prepixin = transf_pre_dicomReader.FindPoint( inpt[0], inpt[1], inpt[2]) transf_pre_dicomReader.GetPoint(prepixin, im_pts) transf_pre_dicomReader.ComputeStructuredCoordinates( im_pts, ijk_in, pco) #print ijk_in # get pre-contrast SIout in 6-c-neighbordhood to normalized RSIgroup [See equation 1] from paper prepixout = transf_pre_dicomReader.FindPoint( outpt[0], outpt[1], outpt[2]) transf_pre_dicomReader.GetPoint(prepixout, im_pts) transf_pre_dicomReader.ComputeStructuredCoordinates( im_pts, ijk_out, pco) #print ijk_out # get t-post SIin SIin_pixVal = transf_sub_pre_dicomReader.GetScalarComponentAsFloat( ijk_in[0], ijk_in[1], ijk_in[2], 0) preSIin_pixVal = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_in[0], ijk_in[1], ijk_in[2], 0) + epsilon RSIin = SIin_pixVal / preSIin_pixVal #### # get t-post SIout 6-c-neighbordhood #cn1 SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0] + 1, ijk_out[1], ijk_out[2], 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0] + 1, ijk_out[1], ijk_out[2], 0) + epsilon SIout_pixVal.append(float(SIout / preSIout)) #cn2 SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0] - 1, ijk_out[1], ijk_out[2], 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0] - 1, ijk_out[1], ijk_out[2], 0) + epsilon SIout_pixVal.append(float(SIout / preSIout)) #cn3 SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] + 1, ijk_out[2], 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] + 1, ijk_out[2], 0) + epsilon SIout_pixVal.append(float(SIout / preSIout)) #cn4 SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] - 1, ijk_out[2], 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] - 1, ijk_out[2], 0) + epsilon SIout_pixVal.append(float(SIout / preSIout)) #cn5 SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1], ijk_out[2] + 1, 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1], ijk_out[2] + 1, 0) + epsilon SIout_pixVal.append(float(SIout / preSIout)) #cn6 SIout = transf_sub_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] - 1, ijk_out[2] - 1, 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] - 1, ijk_out[2] - 1, 0) + epsilon SIout_pixVal.append(float(SIout / preSIout)) RSIout = mean(SIout_pixVal) ### # get last-post SIout 6-c-neighbordhood #cn1 SIout = transf_last_dicomReader.GetScalarComponentAsFloat( 
ijk_out[0] + 1, ijk_out[1], ijk_out[2], 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0] + 1, ijk_out[1], ijk_out[2], 0) + epsilon lastSIout_pixVal.append(float(SIout / preSIout)) #cn2 SIout = transf_last_dicomReader.GetScalarComponentAsFloat( ijk_out[0] - 1, ijk_out[1], ijk_out[2], 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0] - 1, ijk_out[1], ijk_out[2], 0) + epsilon lastSIout_pixVal.append(float(SIout / preSIout)) #cn3 SIout = transf_last_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] + 1, ijk_out[2], 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] + 1, ijk_out[2], 0) + epsilon lastSIout_pixVal.append(float(SIout / preSIout)) #cn4 SIout = transf_last_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] - 1, ijk_out[2], 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] - 1, ijk_out[2], 0) + epsilon lastSIout_pixVal.append(float(SIout / preSIout)) #cn5 SIout = transf_last_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1], ijk_out[2] + 1, 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1], ijk_out[2] + 1, 0) + epsilon lastSIout_pixVal.append(float(SIout / preSIout)) #cn6 SIout = transf_last_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] - 1, ijk_out[2] - 1, 0) preSIout = transf_pre_dicomReader.GetScalarComponentAsFloat( ijk_out[0], ijk_out[1] - 1, ijk_out[2] - 1, 0) + epsilon lastSIout_pixVal.append(float(SIout / preSIout)) # calculate RSIoutf = mean(lastSIout_pixVal) ### compute feature num_margin.append(RSIin - RSIout) den_margin.append(RSIin - RSIoutf) #print num_margin #print den_margin SIout_pixVal = [] lastSIout_pixVal = [] self.edge_sharp_mean = mean(array(num_margin).astype(float)) / mean( array(den_margin).astype(float)) self.edge_sharp_std = std(array(num_margin).astype(float)) / std( array(den_margin).astype(float)) print "\n edge_sharp_mean: " print self.edge_sharp_mean print "\n edge_sharp_std: " print self.edge_sharp_std ################################################## # orgamize into dataframe self.morphologyFeatures = DataFrame( data=array([[ mean(self.allmin_F_r_i), mean(self.allmax_F_r_i), mean(self.allmean_F_r_i), mean(self.allvar_F_r_i), mean(self.allskew_F_r_i), mean(self.allkurt_F_r_i), self.i_var_max, self.ii_var_min, self.iii_var_max, self.iii_var_max_k, self.ivVariance, self.circularity, self.irregularity, self.edge_sharp_mean, self.edge_sharp_std, self.max_RGH_mean, self.max_RGH_mean_k, self.max_RGH_var, self.max_RGH_var_k ]]), columns=[ 'min_F_r_i', 'max_F_r_i', 'mean_F_r_i', 'var_F_r_i', 'skew_F_r_i', 'kurt_F_r_i', 'iMax_Variance_uptake', 'iiMin_change_Variance_uptake', 'iiiMax_Margin_Gradient', 'k_Max_Margin_Grad', 'ivVariance', 'circularity', 'irregularity', 'edge_sharp_mean', 'edge_sharp_std', 'max_RGH_mean', 'max_RGH_mean_k', 'max_RGH_var', 'max_RGH_var_k' ]) return self.morphologyFeatures
from scipy import stats # Generating a normal distribution sample # with 100 elements sample = np.random.randn(100) # The harmonic mean: Sample values have to # be greater than 0. out = stats.hmean(sample[sample > 0]) print('Harmonic mean = ' + str(out)) # The mean, where values below -1 and above 1 are # removed for the mean calculation out = stats.tmean(sample, limits=(-1, 1)) print('\nTrimmed mean = ' + str(out)) # Calculating the skewness of the sample out = stats.skew(sample) print('\nSkewness = ' + str(out)) # Additionally, there is a handy summary function called # describe, which gives a quick look at the data. out = stats.describe(sample) print('\nSize = ' + str(out[0])) print('Min = ' + str(out[1][0])) print('Max = ' + str(out[1][1])) print('Mean = ' + str(out[2])) print('Variance = ' + str(out[3])) print('Skewness = ' + str(out[4])) print('Kurtosis = ' + str(out[5]))
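# The positional indexing above (out[0], out[1][0], ...) works, but describe returns a
# DescribeResult namedtuple, so the same values can be read by name, which is less
# error-prone. A small sketch:
import numpy as np
from scipy import stats

sample = np.random.randn(100)
out = stats.describe(sample)

print('Size =', out.nobs)
print('Min =', out.minmax[0])
print('Max =', out.minmax[1])
print('Mean =', out.mean)
print('Variance =', out.variance)   # unbiased (ddof=1) sample variance
print('Skewness =', out.skewness)
print('Kurtosis =', out.kurtosis)   # Fisher definition: 0 for a normal distribution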
def five_figure_summary(self, col_position):
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; use the builtin float
    statistics = stats.describe(self.array[1:, col_position].astype(float))
    return f"Five-figure stats of column {col_position}: {statistics}"
"./Outputs/avg_fare_info/1/model for fleet size 1500 surge 2fdemand 0.0perc_k 0pro_s 0 repl0.p", "rb", ) ) report = m.get_service_rate_per_zone() report report.LOS.describe() print("total_demand = {}".format(report.total.sum())) total_demand = 20000 system_LOS = report.served.sum() / total_demand system_LOS np.sum(m.operator.revenues) drivers_fares = [np.sum(v.collected_fares) for v in m.vehilcs] stats.describe(drivers_fares) np.median(drivers_fares) # print("vehicle utilization = {}".format(report.idle.sum()/(report.idle.sum() + report.incoming.sum()))) z = m.zones[10] l = [z.id for z in m.zones] l.index(236) directory = "./Outputs/zone_demand_viz/" if not os.path.exists(directory): os.makedirs(directory)
def __init__(self, samples): # Now compute the pedestals from scipy.stats import describe from numpy import floor # Did we receive any samples? if not len(samples): raise Exception("Did not receive any samples!") # Some board information self.board_id = samples[0].board_id # Use the first event to determine the list of channels self.chan_list = samples[0].channels.keys() # Sanity checks on the pedestal samples for sample in samples: if not isinstance(sample, event): raise Exception( "Encountered non-event in list of samples. Nonsense.") if not self.chan_list == sample.channels.keys(): raise Exception( "Provided sample events for pedestaling have inhomogeneous channel content. This .... is ... U N A C C E P T A B L E E E E ---- U N A C C E P T A B L E E E E E E E" ) if not sample.keep_offset: raise Exception( "Refusing to build pedestals with torn data (ROI mode)") # Set up for pedestals self.mean = {} self.variance = {} # Gotta do it manually for chan_id in self.chan_list: caps = [[] for x in range(0, len(samples[0].channels[chan_id]))] self.mean[chan_id] = [] self.variance[chan_id] = [] for evt in samples: for capacitor, ampl in enumerate(evt.channels[chan_id]): if not ampl is None: caps[capacitor].append(ampl) # Now we have it in filtered capacitor form for cap in caps: # Cap is a list N = len(cap) if N == 0: self.mean[chan_id].append(None) self.variance[chan_id].append(None) elif N == 1: self.mean[chan_id].append(cap) self.variance[chan_id].append(None) else: bs, bs, mean, variance, *bs = describe(cap) # # Because the numpy method returns some ass-retarded type # # Note that we save the mean as an integer. # This lets us do integer subtraction without conversion when removing pedestals # directly from the data as it comes in. # # Since noise is a least 30 ADC counts, this changes nothing. # self.mean[chan_id].append(round(float(mean))) self.variance[chan_id].append(float(variance)) # Remove the samples because python can't pickle it del (self.chan_list)
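# The per-capacitor loop above computes a mean and an unbiased variance for each
# capacitor via describe. If the samples were padded into a 2D array with NaN marking
# missing entries, the same pedestals could be obtained in one vectorized pass; a sketch
# under that assumption (the array layout is hypothetical, not the event format above).
import numpy as np

# rows = events, columns = capacitors; NaN marks samples outside the readout window
amplitudes = np.array([
    [101.0, 99.0, np.nan],
    [103.0, 98.0, 100.0],
    [102.0, np.nan, 102.0],
])

counts = np.sum(~np.isnan(amplitudes), axis=0)
means = np.round(np.nanmean(amplitudes, axis=0))    # rounded pedestal, as in the class above
variances = np.nanvar(amplitudes, axis=0, ddof=1)   # ddof=1 matches describe()'s unbiased variance

# capacitors with fewer than 2 samples have no defined variance
variances = np.where(counts > 1, variances, np.nan)
print(counts, means, variances)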
with open('SalaryData.csv', 'r') as csvfile: read = csv.reader(csvfile, delimiter=',', quotechar='|') f = 0 for row in read: if f != 0: Data.append(row) f = f + 1 Exp = [] TExp = [] Sal = [] for i in Data: Exp.append(int(i[2])) TExp.append(int(i[4])) Sal.append(int(i[5])) print("Stats of Experience: ") print(stats.describe(Exp)) print("Stats of Total Experience: ") print(stats.describe(TExp)) print("Stats of Salary: ") print(stats.describe(Sal)) fig = plt.figure() fig.suptitle('Experience V/S Salary', fontsize=14, fontweight='bold') plt.scatter(Exp, Sal) Sal = np.array(Sal) Exp = np.array(Exp) popt, pcov = curve_fit(func, Exp, Sal) SSR = (sum((func(Exp, *popt) - Sal)**2) / Exp.size) print(SSR) plt.plot(np.unique(Exp), np.poly1d(np.polyfit(Exp, Sal, 3))(np.unique(Exp))) #plt.plot(Exp, func(Exp, *popt)) plt.xlabel("Experience")
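# func is defined elsewhere in the original script; since the plot also overlays a
# degree-3 polynomial fit (np.polyfit(Exp, Sal, 3)), a cubic model is a plausible
# stand-in. The following is a hypothetical sketch only, not the author's actual func.
import numpy as np
from scipy.optimize import curve_fit

def func(x, a, b, c, d):
    # hypothetical cubic model; the original script defines its own func elsewhere
    return a * x**3 + b * x**2 + c * x + d

# toy experience/salary data for illustration
Exp = np.array([1, 2, 3, 4, 5, 7, 10, 12, 15], dtype=float)
Sal = np.array([30, 35, 42, 50, 55, 70, 90, 100, 120], dtype=float)

popt, pcov = curve_fit(func, Exp, Sal)
mse = np.mean((func(Exp, *popt) - Sal) ** 2)   # the "SSR" printed above is this mean squared residual
print(popt, mse)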
print(zarr)

# generate integers
cnt = 0
for i in np.arange(3):
    for j in np.arange(5):
        cnt += 1
        zarr[i, j] = cnt
print(zarr)

# create an array by reading an external CSV file
phone = np.genfromtxt('c:/Java/phone-01.csv', delimiter=',')  # build an array from a text file
print(phone)
print(np.mean(phone[:, 2]))    # print the mean of the screen-size column
print(np.median(phone[:, 2]))  # median
print('total count : ', len(phone))
p_col3 = phone[:, 2]
print(np.percentile(p_col3, 0))    # quartile: minimum
print(np.percentile(p_col3, 25))   # first quartile
print(np.percentile(p_col3, 50))   # second quartile (median)
print(np.percentile(p_col3, 75))   # third quartile
print(np.percentile(p_col3, 100))  # quartile: maximum

# scipy provides a describe function that computes
# several descriptive statistics at once
from scipy.stats import describe
print(describe(phone))
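# Note that when describe receives a 2D array like phone, it summarizes each column
# separately (the default is axis=0); pass axis=None to pool every element. A small sketch:
import numpy as np
from scipy.stats import describe

data = np.array([[1.0, 10.0],
                 [2.0, 20.0],
                 [3.0, 30.0]])

print(describe(data))             # per-column stats (axis=0): means [2., 20.], etc.
print(describe(data, axis=None))  # stats over all six values pooled together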
from scipy import stats import numpy as np s = np.genfromtxt('Brrrr.log', dtype='float') #print(s) s2 = np.array(s) x2, x1, b = s2[:, 0], s2[:, 1], s2[:, 2] ''' print(x2) print() print(b) ''' print(stats.describe(x2)) print() print(stats.describe(x1)) print() print(stats.describe(b)) print() from math import sqrt a = -.00256515 + sqrt(.00069877876) b = 3.805212 + sqrt(1338.377) c = -242.17783 + sqrt(160203168.52084109) print("a:", a) print("b:", b) print("c:", c)
def main(): #Path to config file _, config_path = deepcopy(sys.argv) with open(config_path, "r") as f: config = json.load(f) data_dir = config["data_dir"] if not os.path.isdir(data_dir): os.mkdir(data_dir) specs = config["specifications"] with open(os.path.join(data_dir, "config_copy.json"), 'w', newline='') as f: f.write(json.dumps(config)) vol_dat = [] run_num = 0 for run_vals in specs: concentrations = run_vals["concentrations"] sample_volume = run_vals["sample_volume"] sample_diffusive_const = run_vals["sample_diffusive_const"] num_timesteps = run_vals["Number of timesteps"] molecular_radius = run_vals["Molecular Radius"] min_droplet_vol = run_vals["Min Droplet Volume"] num_droplets = [] means = [] for c in concentrations: print("Started run for concentration " + str(c) + "uM", flush=True) sample = Sample(sample_volume, sample_diffusive_const, c, molecular_radius, min_droplet_vol) aggs = sample.simulate(num_timesteps) volumes = [] for agg in aggs: if agg.is_droplet(): volumes.append(agg.volume()) vol_dat += [volumes] if len(volumes) == 0: num_droplets.append(0) means.append(0) else: nobs, minmax, mean, variance, skewness, kurtosis = sp.describe( volumes) num_droplets.append(nobs) means.append(mean) print("Finished run for concentration " + str(c) + "uM", flush=True) run_num += 1 run_dir = os.path.join(data_dir, "run" + str(run_num)) os.mkdir(run_dir) with open(os.path.join(run_dir, "volumes_dat.csv"), 'w', newline='') as csvfile: writer = csv.writer(csvfile, delimiter=',') writer.writerows(vol_dat) plt.rc('font', family='serif') fig = plt.figure(figsize=(10, 10)) ax = fig.add_subplot(1, 1, 1) for item in (ax.get_xticklabels() + ax.get_yticklabels()): item.set_fontsize(20) for item in ([ax.title, ax.xaxis.label, ax.yaxis.label]): item.set_fontsize(30) ax.plot(concentrations, num_droplets, "sb:") ax.set_title("Number of Droplets") ax.set_xlabel("Concentration (uM)") ax.set_ylabel("Number of Droplets") plt.savefig(os.path.join(run_dir, 'num_droplets.png'), bbox_inches='tight') fig = plt.figure(figsize=(10, 10)) ax = fig.add_subplot(1, 1, 1) for item in (ax.get_xticklabels() + ax.get_yticklabels()): item.set_fontsize(20) for item in ([ax.title, ax.xaxis.label, ax.yaxis.label]): item.set_fontsize(30) ax.plot(concentrations, means, "sb:") ax.set_title("Mean Droplet Volume") ax.set_xlabel("Concentration (uM)") ax.set_ylabel("Mean Droplet Volume (um^3)") plt.savefig(os.path.join(run_dir, 'mean_droplet_volume.png'), bbox_inches='tight')
    # Handle errors when reading the year; this also skips the file header.
    try:
        invoice_year = time.strptime(line_items[4], '%m/%d/%y %H:%M').tm_year
    except ValueError:
        continue

    # Ignore purchases that did not happen in 2011.
    if invoice_year != 2011:
        continue

    # Fill the data structures with the parsed fields.
    # Since we care about the number of distinct products, product codes are kept in a set.
    user_product_dic.setdefault(user_code, set())
    user_product_dic[user_code].add(product_id)

    product_user_dic.setdefault(product_id, set())
    product_user_dic[product_id].add(user_code)

    product_id_name_dic[product_id] = product_name

# With the data structures filled, build a list of how many distinct products each user bought.
product_per_user_li = [len(x) for x in user_product_dic.values()]

# Print the final number of users and products used in this chapter.
print('# of users:', len(user_product_dic))
print('# of products:', len(product_user_dic))

# Print basic statistics of the number of distinct products bought per user.
print(stats.describe(product_per_user_li))
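# setdefault with a set works, but collections.defaultdict(set) expresses the same
# user-to-products and product-to-users indexes a little more directly. A sketch on toy
# purchase records (the field layout mirrors the loop above; the data is made up).
from collections import defaultdict
from scipy import stats

# (user_code, product_id, product_name) tuples, as parsed from the invoice lines above
rows = [
    ('u1', 'p1', 'mug'), ('u1', 'p2', 'tea'), ('u2', 'p1', 'mug'),
    ('u2', 'p3', 'pot'), ('u3', 'p2', 'tea'), ('u3', 'p3', 'pot'), ('u3', 'p4', 'tray'),
]

user_product_dic = defaultdict(set)
product_user_dic = defaultdict(set)
product_id_name_dic = {}
for user_code, product_id, product_name in rows:
    user_product_dic[user_code].add(product_id)
    product_user_dic[product_id].add(user_code)
    product_id_name_dic[product_id] = product_name

product_per_user_li = [len(s) for s in user_product_dic.values()]
print('# of users:', len(user_product_dic))
print('# of products:', len(product_user_dic))
print(stats.describe(product_per_user_li))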
def fxHighStressTest(port0, baudRate, port1 = "", commandFreq = 1000, positionAmplitude = 10000, currentAmplitude = 2500, positionFreq = 1, currentFreq = 5, currentAsymmetricG = 1.25, numberOfLoops = 720): global times # Elapsed time since strart of run global currentRequests global positionRequests global readDeviceTimes # Timing data for fxReadDevice() global sendMotorTimes # Timing data for fxSendMotorCommand global setGainsTimes # Timing data for fxSetGains() global cycleStopTimes global data0 # Contains state of ActPack0 global data1 # Contains state of ActPack1 ########### One vs two devices ############ secondDevice = False if (port1 != ""): secondDevice = True if (secondDevice): print("Running high stress test with two devices") else: print("Running high stress test with one device") ########### Debug & Data Logging ############ debugLoggingLevel = 6 # 6 is least verbose, 0 is most verbose dataLog = False # Data log logs device data delay_time = float(1/(float(commandFreq))) print('Delay time: ', delay_time) ########### Open the device(s) and start streaming ############ devId0 = fxOpen(port0, baudRate, debugLoggingLevel) fxStartStreaming(devId0, commandFreq, dataLog) print('Connected to device with Id:', devId0) devId1 = -1 if (secondDevice): print('Port: ', port1) print('BaudRate: ', baudRate) print('debugLoggingLevel: ', debugLoggingLevel) devId1 = fxOpen(port1, baudRate, debugLoggingLevel) fxStartStreaming(devId1, commandFreq, dataLog) print('Connected to device with Id:', devId1) ############# Main Code ############ ######## Make your changes here ######### # Get initial position: print('Reading initial position...') # Give the device time to consume the startStreaming command and start streaming sleep(0.1) data0 = fxReadDevice(devId0) initialPos0 = data0.encoderAngle # May be used to offset subsequent readings print("Initial position 0:", initialPos0) initialPos1 = 0 if (secondDevice): data1 = fxReadDevice(devId1) initialPos1 = data1.encoderAngle print("Initial position 1:", initialPos1) # Generate control profiles print('Command table #1 - Position Sine:') positionSamples = sinGenerator(positionAmplitude, positionFreq, commandFreq) print(np.int64(positionSamples)) print('Command table #2 - Current Sine:') currentSamples = sinGenerator(currentAmplitude, currentFreq, commandFreq) print("number of samples is: ", len(currentSamples)) print(np.int64(currentSamples)) print('Command table #3 - Current Sine:') currentSamplesLine = lineGenerator(0, 0.15, commandFreq) #print(np.int64(currentSamplesLine)) # Initialize lists # cycleStopTimes = [] try: t0 = time() # Record start time of experiment i = 0 for reps in range(0, numberOfLoops): print("") print("Rep #", reps+1,"out of",numberOfLoops) print("-------------------") # Step 0: set position controller # ------------------------------- print("Step 0: set position controller") sleep(delay_time) # Important in loop 2+ if (i): # Second or later iterations in loop # setPositionCtrl( devId0, devId1, secondDevice, data0.encoderAngle, initialPos1) sendAndTimeCmds(t0, devId0, devId1, secondDevice, initialPos0, initialPos1, current0=0, current1=0, motorCmd=FxPosition, position0=data0.encoderAngle, position1=initialPos1, posReq=0, setGains=True) # ToDo: data1.encoderAngle else: # First loop iteration # setPositionCtrl( devId0, devId1, secondDevice, initialPos0, initialPos1) sendAndTimeCmds(t0, devId0, devId1, secondDevice, initialPos0=0, initialPos1=0, current0=0, current1=0, motorCmd=FxPosition, position0=initialPos0, position1=initialPos1, 
posReq=0, setGains=True) # Step 1: go to initial position # ------------------------------- if (i): # Second or later iterations in loop print("Step 1: go to initial position") linSamples = linearInterp(data0.encoderAngle-initialPos0, 0, 100) #print(np.int64(linSamples)) for sample in linSamples: sleep(delay_time) sendAndTimeCmds(t0, devId0, devId1, secondDevice, initialPos0, initialPos1, current0=0, current1=0, motorCmd=FxPosition, position0=sample+initialPos0, position1=sample+initialPos1, posReq=sample, setGains=False) """ # set controller to the next sample # read ActPack data tstart = time() data0 = fxReadDevice(devId0) tstop = time() readDeviceTimes.append(tstop - tstart) if (secondDevice): data1 = fxReadDevice(devId1) # Position setpoint: tstart = time() fxSendMotorCommand(devId0, FxPosition, sample + initialPos0) tstop = time() sendMotorTimes.append(tstop - tstart) currentMeasurements0.append(data0.motorCurrent) positionMeasurements0.append(data0.encoderAngle - initialPos0) if (secondDevice): fxSendMotorCommand(devId1, FxPosition, sample + initialPos1) currentMeasurements1.append(data1.motorCurrent) positionMeasurements1.append(data1.encoderAngle-initialPos1) times.append(time() - t0) currentRequests.append(0) positionRequests.append(sample) # BAB: sample+initialPos0 ??? """ i = i + 1 else: # First time in loop print("Step 1: skipped, first round") # Step 2: position sine wave # -------------------------- print("Step 2: track position sine wave") for sample in positionSamples: sleep(delay_time) sendAndTimeCmds(t0, devId0, devId1, secondDevice,initialPos0, initialPos1, current0=0, current1=0, motorCmd=FxPosition, position0=sample+initialPos0, position1=sample+initialPos1, posReq=0, setGains=False) """ # set controller to the next sample # read ActPack data tstart = time() data0 = fxReadDevice(devId0) tstop = time() readDeviceTimes.append(tstop - tstart) if (secondDevice): data1 = fxReadDevice(devId1) # Position setpoint: tstart = time() fxSendMotorCommand(devId0, FxPosition, sample + initialPos0) tstop = time() sendMotorTimes.append(tstop - tstart) currentMeasurements0.append(data0.motorCurrent) positionMeasurements0.append(data0.encoderAngle - initialPos0) if (secondDevice): fxSendMotorCommand(devId1, FxPosition, sample + initialPos1) currentMeasurements1.append(data1.motorCurrent) positionMeasurements1.append(data1.encoderAngle - initialPos1) times.append(time() - t0) currentRequests.append(0) positionRequests.append(sample) # BAB: sample+initialPos0 ??? 
""" i = i + 1 # Step 3: set current controller # ------------------------------- print("Step 3: set current controller") # setCurrentCtrl( devId0, devId1, secondDevice, 0, 0) sendAndTimeCmds(t0, devId0, devId1, secondDevice, initialPos0, initialPos1, current0=0, current1=0, motorCmd=FxCurrent, position0=0, position1=0, posReq=0, setGains=True) # Step 4: current setpoint # -------------------------- print("Step 4: track current sine wave") for sample in currentSamples: sleep(delay_time) # We use more current on the "way back" to come back closer to # the staring point if(sample <= 0): #No change compensatedSample = sample else: #Apply gain compensatedSample = np.int64(currentAsymmetricG * sample) sendAndTimeCmds(t0, devId0, devId1, secondDevice,initialPos0, initialPos1, current0=compensatedSample, current1=compensatedSample, motorCmd=FxCurrent, position0=0, position1=0, posReq=0, setGains=False) # set controller to the next sample # read ActPack data """ tstart = time() data0 = fxReadDevice(devId0) tstop = time() readDeviceTimes.append(tstop - tstart) if (secondDevice): data1 = fxReadDevice(devId1) # Position setpoint: tstart = time() fxSendMotorCommand(devId0, FxCurrent, compensatedSample) tstop = time() sendMotorTimes.append(tstop - tstart) currentMeasurements0.append(data0.motorCurrent) positionMeasurements0.append(data0.encoderAngle - initialPos0) if (secondDevice): fxSendMotorCommand(devId1, FxCurrent, compensatedSample) currentMeasurements1.append(data1.motorCurrent) positionMeasurements1.append(data1.encoderAngle - initialPos1) times.append(time() - t0) currentRequests.append(compensatedSample) positionRequests.append(0) """ i = i + 1 # Step 5: short pause at 0 current to allow a slow-down # ----------------------------------------------------- print("Step 5: motor slow-down, zero current") for sample in currentSamplesLine: sleep(delay_time) sendAndTimeCmds(t0, devId0, devId1, secondDevice,initialPos0, initialPos1, current0=sample, current1=sample, motorCmd=FxCurrent, position0=0, position1=0, posReq=0, setGains=False) """ # set controller to the next sample # read ActPack data tstart = time() data0 = fxReadDevice(devId0) tstop = time() readDeviceTimes.append(tstop - tstart) if (secondDevice): data1 = fxReadDevice(devId1) # Position setpoint: tstart = time() fxSendMotorCommand(devId0, FxCurrent, sample) tstop = time() sendMotorTimes.append(tstop - tstart) currentMeasurements0.append(data0.motorCurrent) positionMeasurements0.append(data0.encoderAngle - initialPos0) if (secondDevice): fxSendMotorCommand(devId1, FxCurrent, sample) currentMeasurements1.append(data1.motorCurrent) positionMeasurements1.append(data1.encoderAngle - initialPos1) times.append(time() - t0) currentRequests.append(sample) positionRequests.append(0) """ i = i + 1 # We'll draw a line at the end of every period cycleStopTimes.append(time() - t0) elapsed_time = time() - t0 except KeyboardInterrupt: print ('Keypress detected. 
Exiting gracefully ...') fxClose(devId0) fxClose(devId1) ######## Stats: ######### print("") print("Final Stats:") print("------------") actual_period = cycleStopTimes[0] command_frequency = i / elapsed_time print("Number of commands sent: " + str(i)) print("Total time (s): " + str(elapsed_time)) print("Requested command frequency: "+"{:.2f}".format(commandFreq)) print("Actual command frequency (Hz): "+"{:.2f}".format(command_frequency)) print("") print('currentSamplesLine: ', len(currentSamplesLine)) print('size(times)', len(times)) print('size(currentRequests): ', len(currentRequests)) print('size(currentMeasurements0): ', len(currentMeasurements0)) print('size(setGainsTimes): ', len(setGainsTimes)) print('') ######## Summary stats about intividual arrays: ######### print('\n\ntimes: ', stats.describe(times)) print('\n\ncurrentRequests: ', stats.describe(currentRequests)) print('\n\ncurrentMeasurements0: ', stats.describe(currentMeasurements0)) # print('\n\ncurrentMeasurements1: ', stats.describe(currentMeasurements1)) print('\n\npositionRequests: ', stats.describe(positionRequests)) print('\n\npositionMeasurements0: ', stats.describe(positionMeasurements0)) # print('\n\npositionMeasurements1: ', stats.describe(positionMeasurements1)) print('\n\nreadDeviceTimes: ', stats.describe(readDeviceTimes)) print('\n\nsendMotorTimes: ', stats.describe(sendMotorTimes)) print('\n\nseetGainsTimes: ', stats.describe(setGainsTimes)) ######## End of Main Code ######### ######## Plotting Code, you can edit this ################## ###### Begin Create unique data filename and save desired and measured values now = datetime.now().strftime("%Y-%m-%d_%H-%M") data_fn = 'log/' + now + '_Current.csv' print('Do create Current data file ['+ data_fn + ']') # NON-PYTHONIC, but efficient write to file: # with open(data_fn, 'w') as df: # for i in range(len(currentRequests)): # df.write(str(times[i]) + ',' + str(currentRequests[i]) + ',' # + str(currentMeasurements0[i]) + '\n') data_fn = 'log/' + now + '_Position.csv' print('Do create Position data file ['+ data_fn + ']') # with open(data_fn, 'w') as df: # for i in range(len(positionRequests)): # df.write(str(times[i]) + ',' + str(positionRequests[i]) + ',' # + str(positionMeasurements0[i]) + '\n') ###### End Create unique data filename and save desired and measured values # Current Plot: plt.figure(1) title = "Motor Current" plt.plot(times, currentRequests, color = 'b', label = 'desired current') plt.plot(times, currentMeasurements0, color = 'r', label = 'measured current') plt.xlabel("Time (s)") plt.ylabel("Motor current (mA)") plt.title(title) plt.legend(loc='upper right') # Draw a vertical line at the end of each cycle for endpoints in cycleStopTimes: plt.axvline(x=endpoints) # Position Plot: plt.figure(2) title = "Motor Position" plt.plot(times, positionRequests, color = 'b', label = 'desired position') plt.plot(times, positionMeasurements0, color = 'r', label = 'measured position') plt.xlabel("Time (s)") plt.ylabel("Encoder position") plt.title(title) plt.legend(loc='upper right') # Draw a vertical line at the end of each cycle for endpoints in cycleStopTimes: plt.axvline(x=endpoints) plt.figure(3) # Convert command times into millisec sendMotorTimes = [i * 1000 for i in sendMotorTimes] plt.plot(times, sendMotorTimes, color='b', label='Send Motor Times') plt.xlabel("Time (ms)") plt.ylabel("Command Time (ms)") plt.title("Send Motor Times") plt.legend(loc='upper right') plt.figure(4) plt.yscale('log') plt.hist(sendMotorTimes, bins=100, label = 'Send Motor Times') 
plt.yscale('log') plt.xlabel("Time (ms)") plt.ylabel("Occurrences") plt.title("Send Motor Commands") plt.legend(loc='upper right') plt.figure(5) # Convert command times into millisec readDeviceTimes = [i * 1000 for i in readDeviceTimes] plt.plot(times, readDeviceTimes, color='b', label='Read Device Times') plt.xlabel("Time (ms)") plt.ylabel("Command Time (ms)") plt.title("Read Device Commands") plt.legend(loc='upper right') plt.figure(6) plt.yscale('log') plt.hist(readDeviceTimes, bins=100, label = 'Read Device Times') plt.yscale('log') plt.xlabel("Time (ms)") plt.ylabel("Occurrences") plt.title("Read Device Commands") plt.legend(loc='upper right') plt.figure(7) # Convert command times into millisec setGainsTimes = [i * 1000 for i in setGainsTimes] plt.plot(times, setGainsTimes, color='b', label='Set Gains Times') plt.xlabel("Time (ms)") plt.ylabel("Command Time (ms)") plt.title("Set Gains Commands") plt.legend(loc='upper right') plt.figure(8) plt.yscale('log') # Remove 0 values in histogram setGainsTimes = [i for i in setGainsTimes if i > 0] plt.hist(setGainsTimes, bins=100, label = 'Set Gains Times') plt.yscale('log') plt.xlabel("Time (ms)") plt.ylabel("Occurrences") plt.title("Set Gains Commands") plt.legend(loc='upper right') # ####### # *** ToDo: add plotting for 2nd device here *** # ####### plt.show()
dset_sbert_data = dset_sbert.remove_columns(['dataset', 'identifier', 'length', 'text']) dset_sbert_tfidf = concatenate_datasets([dsets_tokenized, dset_sbert_data], axis=1) import torch def mean_sbert(x): t = torch.tensor(x["sbert_top_128"]) t16 = t[:, :16].mean(1) t128 = t.mean(1) return dict(sbert_top_16_avg=t16.tolist(), sbert_top_128_avg=t128.tolist()) dset_sbert_tfidf = dset_sbert_tfidf.map(mean_sbert, batched=True, batch_size=4096) dset_sbert_tfidf.save_to_disk("/home/ahemf/processed/dsets_448_sbert_tfidf") sbert_tfidf = dset_sbert_tfidf.remove_columns(['sbert', 'sbert_top_128']) sbert_tfidf.save_to_disk("/home/ahemf/processed/sbert_tfidf") sum(sbert_tfidf["length"]) / 1_000_000_000 == 9.504 sum(sbert_tfidf["length"]) == 9504256152 from scipy.stats import describe describe(sbert_tfidf["perplexity"]) # DescribeResult(nobs=8252138, minmax=(1.0, 2666.609619140625), mean=50.22365915303506, variance=378.6735535134215, skewness=5.66997263818954, kurtosis=204.6866296365421) describe(np.log1p(sbert_tfidf["perplexity"])) # DescribeResult(nobs=8252138, minmax=(0.6931471805599453, 7.888938076667314), mean=3.8780924745662606, variance=0.11594320687589393, skewness=-0.2525622031627243, kurtosis=3.5463577790908074) describe(sbert_tfidf["sbert_top_128_avg"]) # DescribeResult(nobs=8252138, minmax=(0.23806016147136688, 0.9999999403953552), mean=0.48508123392669744, variance=0.00398569604138435, skewness=1.0084712141737993, kurtosis=3.5581830353889874) import pandas as pd pd.Series(sbert_tfidf["sbert_top_128_avg"]).describe()
from scipy import optimize
optimize.minimize(f, x0=0)

from scipy import integrate
res, err = integrate.quad(f, 0, np.inf)

from scipy import interpolate
interpolate.interp1d(x, y, kind='quadratic', fill_value='extrapolate')

from scipy import stats
stats.norm.rvs(size=10)               # normally distributed sample
stats.t.rvs(10, size=100)
stats.norm.pdf(x), stats.norm.cdf(x)  # probability density, cumulative distribution functions
stats.describe(x)                     # calculate statistics for a sample

np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))  # confidence interval for the test RMSE
# this is equivalent to:
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)

fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)  # helps to check whether a distribution is normal
plt.show()

### STATSMODEL
import statsmodels.api as sm
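# The "this is equivalent to" note above can be checked numerically: stats.t.interval on
# the mean of the squared errors reproduces the manual t-score construction. A runnable
# sketch with synthetic squared errors standing in for the test-set residuals.
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
squared_errors = rng.exponential(scale=4.0, size=200)   # stand-in for per-sample squared errors
confidence = 0.95
m = len(squared_errors)
mean = squared_errors.mean()

# library version
lo, hi = stats.t.interval(confidence, m - 1, loc=mean, scale=stats.sem(squared_errors))

# manual version
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)

print(np.sqrt(lo), np.sqrt(hi))                           # RMSE confidence interval
print(np.sqrt(mean - tmargin), np.sqrt(mean + tmargin))   # identical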