import numpy as n
import scipy.stats as st


def dnnPower(shape1, shape2, lb=0, rb=1, NE=1000000):
    """Maximum absolute difference (KS-style distance) between two power-law CDFs."""
    a = st.powerlaw(shape1)
    b = st.powerlaw(shape2)
    domain = n.linspace(lb, rb, NE)
    avals = a.cdf(domain)
    bvals = b.cdf(domain)
    diffP = n.abs(avals - bvals).max()
    return diffP
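# Hedged usage sketch for dnnPower above: the shape values are illustrative,
# and the result is the sup-norm distance between the two power-law CDFs on
# [lb, rb].
print(dnnPower(1.5, 1.7))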
def cmp_scipy(self, g, s):
    x = np.linspace(-1, s + 1, 10 * (s + 2) + 1)
    q = np.linspace(-1, 2, 51)
    p0 = stats.powerlaw(g + 1, scale=s)
    v0 = p0.pdf(x)
    c0 = p0.cdf(x)
    x0 = p0.ppf(q)
    p1 = PowerLaw(g, 0, s)
    v1 = p1.pdf(x)
    c1 = p1.cdf(x)
    x1 = p1.ppf(q)
    self.assertIn(0, x)
    self.assertIn(s, x)
    np.testing.assert_allclose(v0, v1)
    np.testing.assert_allclose(c0, c1)
    np.testing.assert_allclose(x0, x1)
    rep = str(p1)
    self.assertEqual(rep[:9], "PowerLaw(")
    self.assertEqual(rep[-1:], ")")
    v = rep[9:-1].split(",")
    self.assertEqual(float(v[0]), g)
    self.assertEqual(float(v[1]), 0)
    self.assertEqual(float(v[2]), s)
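# Minimal numerical check of the correspondence exercised above (a sketch,
# assuming only numpy/scipy; the exponent and scale values are illustrative):
# scipy's stats.powerlaw has pdf(x; a) = a * x**(a - 1) on [0, 1], so with
# scale=s and a = g + 1 the density is (g + 1) * x**g / s**(g + 1) on [0, s].
import numpy as np
from scipy import stats

g, s = 2.0, 3.0
xs = np.linspace(0, s, 5)
np.testing.assert_allclose(stats.powerlaw(g + 1, scale=s).pdf(xs),
                           (g + 1) * xs**g / s**(g + 1))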
def test_power_law_converter_inversion(power):
    """Check that the power-law conversion is invertible."""
    c = PowerLawConverter(power, scale=1)
    x = stats.powerlaw(power + 1).rvs(size=1000)
    y, _ = c.to_uniform_parameter(x)
    x_out, _ = c.from_uniform_parameter(y)
    np.testing.assert_array_almost_equal(x, x_out)
def test_power_law_converter_distribution(power):
    """
    Check that the distribution of resulting samples is uniform when
    converting from a power law.
    """
    c = PowerLawConverter(power, scale=1)
    x = stats.powerlaw(power + 1).rvs(size=10000)
    y, _ = c.to_uniform_parameter(x)
    d, p = stats.kstest(y, 'uniform')
    assert p >= 0.05
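# Background sketch (assumes only scipy): the conversion above relies on the
# probability integral transform -- applying a distribution's own CDF to its
# samples yields Uniform(0, 1) values.  For stats.powerlaw(a) on [0, 1] the
# CDF is x**a, so y = x**(power + 1) should already look uniform.
from scipy import stats

power = 2.0  # illustrative value
x = stats.powerlaw(power + 1).rvs(size=10000, random_state=0)
y = x ** (power + 1)               # CDF of powerlaw(power + 1) evaluated at x
print(stats.kstest(y, 'uniform'))  # expect a large p-value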
def main(a):
    # arg = raw_input("Zipf & Pareto belong to the power-law family. Please select: 1: Pareto's Law, 2: Zipf's law")
    fig, ax = plt.subplots(1, 1)
    mean, var, skew, kurt = powerlaw.stats(a, moments='mvsk')
    x = power_law_dist(a)
    ax.plot(x, powerlaw.pdf(x, a), 'r-', lw=5, alpha=0.6, label='powerlaw pdf')
    rv = powerlaw(a)
    ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
    r = powerlaw.rvs(a, size=1000)
    ax.legend(loc='best', frameon=False)
    plt.show()
def set_distribution(name='norm', loc=0, scale=1, shape=1):
    distribution = {
        'norm': (norm(loc=loc, scale=scale),
                 r'Normal Distribution',
                 r'$p(x)=\frac{\exp(-x^2/2)}{\sqrt{2\pi}}$' +
                 r', loc = {:.2f}, scale = {:.2f}'.format(loc, scale)),
        'expon': (expon(loc=loc, scale=scale),
                  r'Exponential Distribution',
                  r'$p(x)=\exp(-x)$' +
                  r', loc = {:.2f}, scale = {:.2f}'.format(loc, scale)),
        'powerlaw': (powerlaw(a=shape + 1),
                     r'Power Law Distribution',
                     r'$p(x,\alpha)=\alpha x^{\alpha-1}$' +
                     r', $\alpha-1 = {:.2f}$'.format(shape)),
        'lognorm': (lognorm(s=shape),
                    r'Lognormal Distribution',
                    r'$p(x,s)=\frac{1}{sx\sqrt{2\pi}} \exp(-\frac{\log^2(x)}{2s^2})$' +
                    r', $shape = {:.2f}$'.format(shape)),
    }
    return distribution[name]
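# Hedged usage sketch: set_distribution returns a (frozen distribution, title,
# label) triple, so any of the named distributions can be plotted the same
# way.  The plotting below is illustrative and assumes matplotlib plus the
# frozen-distribution constructors imported above.
import numpy as np
import matplotlib.pyplot as plt

dist, title, label = set_distribution('powerlaw', shape=1.5)
xs = np.linspace(0.01, 1, 200)
plt.plot(xs, dist.pdf(xs), label=label)
plt.title(title)
plt.legend()
plt.show()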
def dataset_write(fp, distriName, byts, tot, dis, pa1, pa2):
    if distriName == 'zipf' or distriName == 'powerlaw':
        props = powerlaw(dis, pa1)
    elif distriName == 'weibull':
        props = weibull(dis, pa1, pa2)
    # if not os.path.exists(fp):
    #     os.mkdir(fp)
    freq = [round(prop * tot) + 1 for prop in props]
    dataset = gen(freq, byts)
    # print(len(dataset))
    if distriName == 'zipf' or distriName == 'powerlaw':
        # fpath = fp + distriName + str(pa1) + '.dat'
        fpath = fp
    elif distriName == 'weibull':
        # fpath = fp + distriName + str(pa1) + '_' + str(pa2) + '.dat'
        fpath = fp
    with open(fpath, 'wb') as f:
        for d in dataset:
            f.write(d)
    return fpath
# Fragment of a larger script: `weib`, `x`, `step`, `diffN`, `diffN2`, `diffU`
# and `diffU2` are defined earlier.
# count, bins, ignored = p.hist(n.random.weibull(5., 1000))
# x = n.arange(1, 100.) / 50.
# scale = count.max() / weib(x, 1., 5.).max()
W = weib(x, 1., 1.5)
W_ = W / (W * step).sum()
W__ = n.cumsum(W_)
W2 = weib(x, 1., 1.7)
W2_ = W2 / (W2 * step).sum()
W2__ = n.cumsum(W2_)
diffW = n.abs(W_ - W2_).max()
# p.plot(x, W_)
# p.plot(x, W2_)
## p.plot(x, weib(x, 1., 5.) * scale)
# p.show()
a = st.powerlaw(1.5)
b = st.powerlaw(1.7)
domain = n.linspace(0, 5.05, 10000)
avals = a.cdf(domain)
bvals = b.cdf(domain)
diffP = n.abs(avals - bvals).max()
print("KS distances for the mathematical models:", diffN, diffN2, diffU, diffU2, diffW, diffP)
# KS distances for the mathematical models:
# 0.0398776116762 0.0439947104098 0.0952338090952 0.047619047619 0.128565475845 0.0460149130584
# X = (-n.ln(U))^{1/a}
lb, rb, NE, shape1, shape2 = 0, 10, 10000, 1.5, 1.7
x = n.linspace(lb, rb, NE)
step = x[1] - x[0]
def generateToy():
    np.random.seed(12345)
    fig, ax = plt.subplots(4, sharex=True)
    # fig, ax = plt.subplots(2)
    powerlaw_arg = 2
    triang_arg = 0.7
    n_samples = 500
    # generate simple line with slope 1, from 0 to 1
    frozen_powerlaw = powerlaw(powerlaw_arg)  # powerlaw.pdf(x, a) = a * x**(a-1)
    # generate triangle with peak at 0.7
    frozen_triangle = triang(triang_arg)  # up-sloping line from loc to (loc + c*scale), then down-sloping from (loc + c*scale) to (loc + scale)
    frozen_uniform = uniform(0.2, 0.5)
    frozen_uniform2 = uniform(0.3, 0.2)
    x = np.linspace(0, 1)
    # // keeps the sample counts integral
    signal = np.random.normal(0.5, 0.1, n_samples // 2)
    data_frame = pd.DataFrame({
        'powerlaw': powerlaw.rvs(powerlaw_arg, size=n_samples),
        'triangle': triang.rvs(triang_arg, size=n_samples),
        'uniform': np.concatenate((uniform.rvs(0.2, 0.5, size=n_samples // 2),
                                   uniform.rvs(0.3, 0.2, size=n_samples // 2))),
        'powerlaw_signal': np.concatenate((powerlaw.rvs(powerlaw_arg, size=n_samples // 2),
                                           signal))
    })
    ax[0].plot(x, frozen_powerlaw.pdf(x), 'k-', lw=2, label='powerlaw pdf')
    hist(data_frame['powerlaw'], bins=100, normed=True, histtype='stepfilled',
         alpha=0.2, label='100 bins', ax=ax[0])
    # hist(data_frame['powerlaw'], bins='blocks', fitness='poly_events', normed=True,
    #      histtype='stepfilled', alpha=0.2, label='b blocks', ax=ax[0])
    ax[0].legend(loc='best')
    ax[1].plot(x, frozen_triangle.pdf(x), 'k-', lw=2, label='triangle pdf')
    hist(data_frame['triangle'], bins=100, normed=True, histtype='stepfilled',
         alpha=0.2, label='100 bins', ax=ax[1])
    hist(data_frame['triangle'], bins='blocks', fitness='poly_events', normed=True,
         histtype='stepfilled', alpha=0.2, label='b blocks', ax=ax[1])
    ax[1].legend(loc='best')
    # ax[0].plot(x, frozen_powerlaw.pdf(x), 'k-', lw=2, label='powerlaw pdf')
    hist(data_frame['powerlaw_signal'], bins=100, normed=True, histtype='stepfilled',
         alpha=0.2, label='100 bins', ax=ax[2])
    # hist(data_frame['powerlaw_signal'], bins='blocks', normed=True, histtype='stepfilled',
    #      alpha=0.2, label='b blocks', ax=ax[2])
    ax[2].legend(loc='best')
    ax[3].plot(x, frozen_uniform.pdf(x) + frozen_uniform2.pdf(x), 'k-', lw=2, label='uniform pdf')
    hist(data_frame['uniform'], bins=100, normed=True, histtype='stepfilled',
         alpha=0.2, label='100 bins', ax=ax[3])
    # hist(data_frame['uniform'], bins='blocks', fitness='poly_events', p0=0.05, normed=True,
    #      histtype='stepfilled', alpha=0.2, label='b blocks', ax=ax[3])
    ax[3].legend(loc='best')
    plt.show()
    fig.savefig('plots/toy_plots.png')
import collections as ct
import scipy.stats as st

income_model_dict = ct.OrderedDict()
income_model_dict['johnsonsu'] = st.johnsonsu(-5.3839367311065747, 0.84376726932941271,
                                              -224.21280806585787, 79.661998696081355)
income_model_dict['powerlaw'] = st.powerlaw(0.16342470577523971, -3.1423954341714262e-15,
                                            55664716.096562646)
income_model_dict['exponpow'] = st.exponpow(0.25441022752240294, -1.8475789041433829e-22,
                                            36120900.670255348)
income_model_dict['nakagami'] = st.nakagami(0.10038339454419823, -3.0390927147076284e-22,
                                            33062195.426077582)
income_model_dict['exponweib'] = st.exponweib(-3.5157658448986489, 0.44492833350419714,
                                              -15427.454196748848, 2440.0278856175246)

drivingdistance_model_dict = ct.OrderedDict()
drivingdistance_model_dict['nakagami'] = st.nakagami(0.11928581143831021, 14.999999999999996,
                                                     41.404620910360876)
drivingdistance_model_dict['ncx2'] = st.ncx2(0.30254190304723211, 1.1286538320791935,
                                             14.999999999999998, 8.7361471573932192)
drivingdistance_model_dict['chi'] = st.chi(0.47882729877571095,
def __init__(self, a, loc, scale, capMaxImpact=False):
    super(PowerLawValue, self).__init__("Basic PowerLaw")
    # Set the powerlaw object now. It doesn't change between runs.
    self.powerdistro = stats.powerlaw(a=a, loc=loc, scale=scale)
    self.capMaxImpact = capMaxImpact
    dimension corresponding to the deviating role, i.e. the number of players
    in role i of dimension i should be num_players[i] - 1.
    """
    payoffs = np.empty(self.num_role_strats)
    for i, (gp, r) in enumerate(zip(self._gps, self.role_index)):
        payoffs[i] = gp.predict(profiles[r]).mean()
    return payoffs


def min_payoffs(self):
    return self._min_payoffs.view()


def max_payoffs(self):
    return self._max_payoffs.view()


_CV_PARAMS = {'alpha': stats.powerlaw(.2, loc=1e-3, scale=50)}


# XXX This changed in a scipy update and should be verified that it's doing
# what we want.
def _train_gp(x, y, **search_kwds):
    if 'n_jobs' in search_kwds and search_kwds['n_jobs'] < 1:
        # one job per cpu core
        search_kwds['n_jobs'] = multiprocessing.cpu_count()
    cv = model_selection.RandomizedSearchCV(
        gaussian_process.GaussianProcessRegressor(), _CV_PARAMS,
        error_score=-np.inf, **search_kwds)
    cv.fit(x, y)
    return cv.best_estimator_
def prepare_dataset(self, name='adult'): if name == 'adult': from utils.load_adult import get_train_test from utils.Custom_Dataset import Custom_Dataset import torch train_data, train_target, test_data, test_target = get_train_test() X_train = torch.tensor(train_data.values, requires_grad=False).float() y_train = torch.tensor(train_target.values, requires_grad=False).long() X_test = torch.tensor(test_data.values, requires_grad=False).float() y_test = torch.tensor(test_target.values, requires_grad=False).long() print("X train shape: ", X_train.shape) print("y train shape: ", y_train.shape) pos, neg =(y_train==1).sum().item() , (y_train==0).sum().item() print("Train set Positive counts: {}".format(pos),"Negative counts: {}.".format(neg), 'Split: {:.2%} - {:.2%}'.format(1. * pos/len(X_train), 1.*neg/len(X_train))) print("X test shape: ", X_test.shape) print("y test shape: ", y_test.shape) pos, neg =(y_test==1).sum().item() , (y_test==0).sum().item() print("Test set Positive counts: {}".format(pos),"Negative counts: {}.".format(neg), 'Split: {:.2%} - {:.2%}'.format(1. * pos/len(X_test), 1.*neg/len(X_test))) train_indices, valid_indices = get_train_valid_indices(len(X_train), self.train_val_split_ratio, self.sample_size_cap) train_set = Custom_Dataset(X_train[train_indices], y_train[train_indices], device=self.device) validation_set = Custom_Dataset(X_train[valid_indices], y_train[valid_indices], device=self.device) test_set = Custom_Dataset(X_test, y_test, device=self.device) return train_set, validation_set, test_set elif name == 'mnist': train = FastMNIST('datasets/MNIST', train=True, download=True) test = FastMNIST('datasets/MNIST', train=False, download=True) train_indices, valid_indices = get_train_valid_indices(len(train), self.train_val_split_ratio, self.sample_size_cap) from utils.Custom_Dataset import Custom_Dataset train_set = Custom_Dataset(train.data[train_indices], train.targets[train_indices], device=self.device) validation_set = Custom_Dataset(train.data[valid_indices],train.targets[valid_indices] , device=self.device) test_set = Custom_Dataset(test.data, test.targets, device=self.device) del train, test return train_set, validation_set, test_set elif name == 'cifar10': ''' from torchvision import transforms transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) ''' train = FastCIFAR10('datasets/cifar', train=True, download=True)#, transform=transform_train) test = FastCIFAR10('datasets/cifar', train=False, download=True)#, transform=transform_test) train_indices, valid_indices = get_train_valid_indices(len(train), self.train_val_split_ratio, self.sample_size_cap) from utils.Custom_Dataset import Custom_Dataset train_set = Custom_Dataset(train.data[train_indices], train.targets[train_indices], device=self.device) validation_set = Custom_Dataset(train.data[valid_indices],train.targets[valid_indices] , device=self.device) test_set = Custom_Dataset(test.data, test.targets, device=self.device) del train, test return train_set, validation_set, test_set elif name == "sst": import torchtext.data as data text_field = data.Field(lower=True) from torch import long as torch_long label_field = LabelField(dtype = torch_long, sequential=False) import torchtext.datasets as datasets train_data, 
validation_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True) indices_list = powerlaw(list(range(len(train_data))), self.n_participants) ratios = [len(indices) / len(train_data) for indices in indices_list] train_datasets = split_torchtext_dataset_ratios(train_data, ratios) text_field.build_vocab(*(train_datasets + [validation_data, test_data])) label_field.build_vocab(*(train_datasets + [validation_data, test_data])) self.args.text_field = text_field self.args.label_field = label_field return train_datasets, validation_data, test_data elif name == 'mr': import torchtext.data as data from utils import mydatasets text_field = data.Field(lower=True) from torch import long as torch_long label_field = LabelField(dtype = torch_long, sequential=False) # label_field = data.Field(sequential=False) train_data, dev_data = mydatasets.MR.splits(text_field, label_field, root='.data/mr', shuffle=False) validation_data, test_data = dev_data.split(split_ratio=0.5, random_state = random.seed(1234)) indices_list = powerlaw(list(range(len(train_data))), self.n_participants) ratios = [len(indices) / len(train_data) for indices in indices_list] train_datasets = split_torchtext_dataset_ratios(train_data, ratios) # print(train_data, dir(train_data)) # print((train_datasets[0].examples[0].text)) # print((train_datasets[0].examples[1].text)) # print((train_datasets[0].examples[2].text)) # exit() text_field.build_vocab( *(train_datasets + [validation_data, test_data] )) label_field.build_vocab( *(train_datasets + [validation_data, test_data] )) self.args.text_field = text_field self.args.label_field = label_field return train_datasets, validation_data, test_data elif name == 'imdb': from torch import long as torch_long # text_field = Field(tokenize = 'spacy', preprocessing = generate_bigrams) # generate_bigrams takes about 2 minutes text_field = Field(tokenize = 'spacy') label_field = LabelField(dtype = torch_long) dirname = '.data/imdb/aclImdb' from torch.nn.init import normal_ from torchtext import datasets train_data, test_data = datasets.IMDB.splits(text_field, label_field) # 25000, 25000 samples each # use 5000 out of 25000 of test_data as the test_data test_data, remaining = test_data.split(split_ratio=0.2 ,random_state = random.seed(1234)) # use 5000 out of the remaining 2000 of test_data as valid data valid_data, remaining = remaining.split(split_ratio=0.25 ,random_state = random.seed(1234)) # train_data, valid_data = train_data.split(split_ratio=self.train_val_split_ratio ,random_state = random.seed(1234)) indices_list = powerlaw(list(range(len(train_data))), self.n_participants) ratios = [len(indices) / len(train_data) for indices in indices_list] train_datasets = split_torchtext_dataset_ratios(train_data, ratios) MAX_VOCAB_SIZE = 25_000 text_field.build_vocab(*(train_datasets + [valid_data, test_data] ), max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.100d", unk_init = normal_) label_field.build_vocab( *(train_datasets + [valid_data, test_data] )) # INPUT_DIM = len(text_field.vocab) # OUTPUT_DIM = 1 # EMBEDDING_DIM = 100 PAD_IDX = text_field.vocab.stoi[text_field.pad_token] self.args.text_field = text_field self.args.label_field = label_field self.args.pad_idx = PAD_IDX return train_datasets, valid_data, test_data elif name == 'names': from utils.load_names import get_train_test from utils.Custom_Dataset import Custom_Dataset import torch from collections import Counter X_train, y_train, X_test, y_test, reference_dict = get_train_test() print("X train shape: ", 
X_train.shape) print("y train shape: ", y_train.shape) print("X test shape: ", X_test.shape) print("y test shape: ", y_test.shape) from utils.Custom_Dataset import Custom_Dataset train_set = Custom_Dataset(X_train, y_train) test_set = Custom_Dataset(X_test, y_test) return train_set, test_set
a = 1.66
mean, var, skew, kurt = powerlaw.stats(a, moments='mvsk')

# Display the probability density function (``pdf``):
x = np.linspace(powerlaw.ppf(0.01, a), powerlaw.ppf(0.99, a), 100)
ax.plot(x, powerlaw.pdf(x, a), 'r-', lw=5, alpha=0.6, label='powerlaw pdf')

# Alternatively, the distribution object can be called (as a function) to fix
# the shape, location and scale parameters. This returns a "frozen" RV object
# holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:
rv = powerlaw(a)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:
vals = powerlaw.ppf([0.001, 0.5, 0.999], a)
np.allclose([0.001, 0.5, 0.999], powerlaw.cdf(vals, a))
# True

# Generate random numbers:
r = powerlaw.rvs(a, size=1000)

# And compare the histogram:
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
def verify(self, pandas_series):
    return stats.kstest(pandas_series, powerlaw(self.alpha).cdf)
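# Hedged usage sketch (assumes scipy and pandas): kstest returns a
# (statistic, p-value) pair, so a large p-value means the series is
# consistent with powerlaw(alpha).  The alpha and sample below are
# illustrative, not taken from the class above.
import pandas as pd
from scipy.stats import kstest, powerlaw

alpha = 1.66
series = pd.Series(powerlaw(alpha).rvs(size=500, random_state=0))
stat, pvalue = kstest(series, powerlaw(alpha).cdf)
print(stat, pvalue)  # small statistic / large p-value when the model matches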
sim.SetParam('sink_radius', 0.1)
sim.SetParam('nbody', 'lfkdk')
sim.SetParam('run_id', 'DISC1')
sim.SetParam('tsnapfirst', 0.0)
sim.SetParam('dt_snap', 6.28 / 20.)
sim.SetParam('tend', 6.28 * 10)
sim.PreSetupForPython()

# Now we start doing the actual work
phi = np.random.uniform(0, 2 * np.pi, Ngas)

# generation of the r coordinate
mr = powerlaw(0.5, loc=0.1, scale=10)
r = mr.rvs(size=Ngas)

# z coordinate, via theta first
theta = np.zeros(Ngas)
for i in range(Ngas):
    theta[i] = np.random.normal(0, scale=0.05 * r[i]**0.5, size=1)
z = theta * r

# convert cylindrical/spherical coordinates to cartesian
x = np.sin(phi) * r
y = np.cos(phi) * r
vphi = 1. / np.sqrt(r)  # / np.sqrt(2.)
vx = -np.cos(phi) * vphi
vy = np.sin(phi) * vphi
def get_train_loaders(self, n_participants, split='powerlaw', batch_size=None): if not batch_size: batch_size = self.train_batch_size if split == 'classimbalance': if self.name not in ['mnist','cifar10']: raise NotImplementedError("Calling on dataset {}. Only mnist and cifar10 are implemnted for this split".format(self.name)) n_classes = 10 data_indices = [(self.train_dataset.targets == class_id).nonzero().view(-1).tolist() for class_id in range(n_classes)] class_sizes = np.linspace(1, n_classes, n_participants, dtype='int') print("class_sizes for each party", class_sizes) party_mean = self.sample_size_cap // self.n_participants from collections import defaultdict party_indices = defaultdict(list) for party_id, class_sz in enumerate(class_sizes): classes = range(class_sz) # can customize classes for each party rather than just listing each_class_id_size = party_mean // class_sz # print("party each class size:", party_id, each_class_id_size) for i, class_id in enumerate(classes): # randomly pick from each class a certain number of samples, with replacement selected_indices = random.choices(data_indices[class_id], k=each_class_id_size) # randomly pick from each class a certain number of samples, without replacement ''' NEED TO MAKE SURE THAT EACH CLASS HAS MORE THAN each_class_id_size for no replacement sampling selected_indices = random.sample(data_indices[class_id],k=each_class_id_size) ''' party_indices[party_id].extend(selected_indices) # top up to make sure all parties have the same number of samples if i == len(classes) - 1 and len(party_indices[party_id]) < party_mean: extra_needed = party_mean - len(party_indices[party_id]) party_indices[party_id].extend(data_indices[class_id][:extra_needed]) data_indices[class_id] = data_indices[class_id][extra_needed:] indices_list = [party_index_list for party_id, party_index_list in party_indices.items()] elif split == 'powerlaw': if self.name in ['sst', 'mr', 'imdb']: # sst, mr, imdb split is different from other datasets, so return here self.train_loaders = [BucketIterator(train_dataset, batch_size=self.train_batch_size, device=self.device, sort_key=lambda x: len(x.text),train=True) for train_dataset in self.train_datasets] self.shard_sizes = [(len(train_dataset)) for train_dataset in self.train_datasets] return self.train_loaders else: indices_list = powerlaw(list(range(len(self.train_dataset))), n_participants) elif split in ['balanced','equal']: from utils.utils import random_split indices_list = random_split(sample_indices=list(range(len(self.train_dataset))), m_bins=n_participants, equal=True) elif split == 'random': from utils.utils import random_split indices_list = random_split(sample_indices=list(range(len(self.train_dataset))), m_bins=n_participants, equal=False) # from collections import Counter # for indices in indices_list: # print(Counter(self.train_dataset.targets[indices].tolist())) self.shard_sizes = [len(indices) for indices in indices_list] participant_train_loaders = [DataLoader(self.train_dataset, batch_size=batch_size, sampler=SubsetRandomSampler(indices)) for indices in indices_list] return participant_train_loaders
from scipy.stats import powerlaw
import matplotlib.pyplot as plt
import numpy as np

pl = powerlaw(.8, loc=0, scale=2)
samples = pl.rvs(10000)                    # create random variables
alpha, loc, scale = powerlaw.fit(samples)  # fit the parameters (not used in the plots below)

# plotting
plt.figure(0)
plt.clf()
plt.hist(samples, bins=50, density=True, histtype='stepfilled', alpha=.9)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
plt.plot(x, pl.pdf(x), linewidth=2, label="fit")

plt.figure(1)
plt.clf()
plt.hist(samples, bins=50, density=True, histtype='stepfilled', alpha=.9)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
plt.plot(x, pl.pdf(x), linewidth=2, label="fit")
plt.xscale("log", base=10, nonpositive='clip')
plt.yscale("log", base=10, nonpositive='clip')
plt.show()
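# Hedged follow-up sketch: to overlay the *fitted* curve rather than the true
# one, freeze a distribution from the parameters returned by powerlaw.fit.
# Variable names (alpha, loc, scale, x) follow the snippet above.
fitted = powerlaw(alpha, loc=loc, scale=scale)
plt.plot(x, fitted.pdf(x), linewidth=2, linestyle="--", label="fitted pdf")
plt.legend()
plt.show()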
def all_dists(): # dists param were taken from scipy.stats official # documentaion examples # Total - 89 return { "alpha": stats.alpha(a=3.57, loc=0.0, scale=1.0), "anglit": stats.anglit(loc=0.0, scale=1.0), "arcsine": stats.arcsine(loc=0.0, scale=1.0), "beta": stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0), "betaprime": stats.betaprime(a=5, b=6, loc=0.0, scale=1.0), "bradford": stats.bradford(c=0.299, loc=0.0, scale=1.0), "burr": stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0), "cauchy": stats.cauchy(loc=0.0, scale=1.0), "chi": stats.chi(df=78, loc=0.0, scale=1.0), "chi2": stats.chi2(df=55, loc=0.0, scale=1.0), "cosine": stats.cosine(loc=0.0, scale=1.0), "dgamma": stats.dgamma(a=1.1, loc=0.0, scale=1.0), "dweibull": stats.dweibull(c=2.07, loc=0.0, scale=1.0), "erlang": stats.erlang(a=2, loc=0.0, scale=1.0), "expon": stats.expon(loc=0.0, scale=1.0), "exponnorm": stats.exponnorm(K=1.5, loc=0.0, scale=1.0), "exponweib": stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0), "exponpow": stats.exponpow(b=2.7, loc=0.0, scale=1.0), "f": stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0), "fatiguelife": stats.fatiguelife(c=29, loc=0.0, scale=1.0), "fisk": stats.fisk(c=3.09, loc=0.0, scale=1.0), "foldcauchy": stats.foldcauchy(c=4.72, loc=0.0, scale=1.0), "foldnorm": stats.foldnorm(c=1.95, loc=0.0, scale=1.0), # "frechet_r": stats.frechet_r(c=1.89, loc=0.0, scale=1.0), # "frechet_l": stats.frechet_l(c=3.63, loc=0.0, scale=1.0), "genlogistic": stats.genlogistic(c=0.412, loc=0.0, scale=1.0), "genpareto": stats.genpareto(c=0.1, loc=0.0, scale=1.0), "gennorm": stats.gennorm(beta=1.3, loc=0.0, scale=1.0), "genexpon": stats.genexpon(a=9.13, b=16.2, c=3.28, loc=0.0, scale=1.0), "genextreme": stats.genextreme(c=-0.1, loc=0.0, scale=1.0), "gausshyper": stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18, loc=0.0, scale=1.0), "gamma": stats.gamma(a=1.99, loc=0.0, scale=1.0), "gengamma": stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0), "genhalflogistic": stats.genhalflogistic(c=0.773, loc=0.0, scale=1.0), "gilbrat": stats.gilbrat(loc=0.0, scale=1.0), "gompertz": stats.gompertz(c=0.947, loc=0.0, scale=1.0), "gumbel_r": stats.gumbel_r(loc=0.0, scale=1.0), "gumbel_l": stats.gumbel_l(loc=0.0, scale=1.0), "halfcauchy": stats.halfcauchy(loc=0.0, scale=1.0), "halflogistic": stats.halflogistic(loc=0.0, scale=1.0), "halfnorm": stats.halfnorm(loc=0.0, scale=1.0), "halfgennorm": stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0), "hypsecant": stats.hypsecant(loc=0.0, scale=1.0), "invgamma": stats.invgamma(a=4.07, loc=0.0, scale=1.0), "invgauss": stats.invgauss(mu=0.145, loc=0.0, scale=1.0), "invweibull": stats.invweibull(c=10.6, loc=0.0, scale=1.0), "johnsonsb": stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0), "johnsonsu": stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0), "ksone": stats.ksone(n=1e03, loc=0.0, scale=1.0), "kstwobign": stats.kstwobign(loc=0.0, scale=1.0), "laplace": stats.laplace(loc=0.0, scale=1.0), "levy": stats.levy(loc=0.0, scale=1.0), "levy_l": stats.levy_l(loc=0.0, scale=1.0), "levy_stable": stats.levy_stable(alpha=0.357, beta=-0.675, loc=0.0, scale=1.0), "logistic": stats.logistic(loc=0.0, scale=1.0), "loggamma": stats.loggamma(c=0.414, loc=0.0, scale=1.0), "loglaplace": stats.loglaplace(c=3.25, loc=0.0, scale=1.0), "lognorm": stats.lognorm(s=0.954, loc=0.0, scale=1.0), "lomax": stats.lomax(c=1.88, loc=0.0, scale=1.0), "maxwell": stats.maxwell(loc=0.0, scale=1.0), "mielke": stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0), "nakagami": stats.nakagami(nu=4.97, loc=0.0, scale=1.0), "ncx2": 
stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0), "ncf": stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0), "nct": stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0), "norm": stats.norm(loc=0.0, scale=1.0), "pareto": stats.pareto(b=2.62, loc=0.0, scale=1.0), "pearson3": stats.pearson3(skew=0.1, loc=0.0, scale=1.0), "powerlaw": stats.powerlaw(a=1.66, loc=0.0, scale=1.0), "powerlognorm": stats.powerlognorm(c=2.14, s=0.446, loc=0.0, scale=1.0), "powernorm": stats.powernorm(c=4.45, loc=0.0, scale=1.0), "rdist": stats.rdist(c=0.9, loc=0.0, scale=1.0), "reciprocal": stats.reciprocal(a=0.00623, b=1.01, loc=0.0, scale=1.0), "rayleigh": stats.rayleigh(loc=0.0, scale=1.0), "rice": stats.rice(b=0.775, loc=0.0, scale=1.0), "recipinvgauss": stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0), "semicircular": stats.semicircular(loc=0.0, scale=1.0), "t": stats.t(df=2.74, loc=0.0, scale=1.0), "triang": stats.triang(c=0.158, loc=0.0, scale=1.0), "truncexpon": stats.truncexpon(b=4.69, loc=0.0, scale=1.0), "truncnorm": stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0), "tukeylambda": stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0), "uniform": stats.uniform(loc=0.0, scale=1.0), "vonmises": stats.vonmises(kappa=3.99, loc=0.0, scale=1.0), "vonmises_line": stats.vonmises_line(kappa=3.99, loc=0.0, scale=1.0), "wald": stats.wald(loc=0.0, scale=1.0), "weibull_min": stats.weibull_min(c=1.79, loc=0.0, scale=1.0), "weibull_max": stats.weibull_max(c=2.87, loc=0.0, scale=1.0), "wrapcauchy": stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0), }
def main(args=None): from ligo.lw import lsctables from ligo.lw import utils as ligolw_utils from ligo.lw import ligolw import lal.series from scipy import stats p = parser() args = p.parse_args(args) xmldoc = ligolw.Document() xmlroot = xmldoc.appendChild(ligolw.LIGO_LW()) process = register_to_xmldoc(xmldoc, p, args) gwcosmo = GWCosmo( cosmology.default_cosmology.get_cosmology_from_string(args.cosmology)) ns_mass_min = 1.0 ns_mass_max = 2.0 bh_mass_min = 5.0 bh_mass_max = 50.0 ns_astro_spin_min = -0.05 ns_astro_spin_max = +0.05 ns_astro_mass_dist = stats.norm(1.33, 0.09) ns_astro_spin_dist = stats.uniform(ns_astro_spin_min, ns_astro_spin_max - ns_astro_spin_min) ns_broad_spin_min = -0.4 ns_broad_spin_max = +0.4 ns_broad_mass_dist = stats.uniform(ns_mass_min, ns_mass_max - ns_mass_min) ns_broad_spin_dist = stats.uniform(ns_broad_spin_min, ns_broad_spin_max - ns_broad_spin_min) bh_astro_spin_min = -0.99 bh_astro_spin_max = +0.99 bh_astro_mass_dist = stats.pareto(b=1.3) bh_astro_spin_dist = stats.uniform(bh_astro_spin_min, bh_astro_spin_max - bh_astro_spin_min) bh_broad_spin_min = -0.99 bh_broad_spin_max = +0.99 bh_broad_mass_dist = stats.reciprocal(bh_mass_min, bh_mass_max) bh_broad_spin_dist = stats.uniform(bh_broad_spin_min, bh_broad_spin_max - bh_broad_spin_min) if args.distribution.startswith('bns_'): m1_min = m2_min = ns_mass_min m1_max = m2_max = ns_mass_max if args.distribution.endswith('_astro'): x1_min = x2_min = ns_astro_spin_min x1_max = x2_max = ns_astro_spin_max m1_dist = m2_dist = ns_astro_mass_dist x1_dist = x2_dist = ns_astro_spin_dist elif args.distribution.endswith('_broad'): x1_min = x2_min = ns_broad_spin_min x1_max = x2_max = ns_broad_spin_max m1_dist = m2_dist = ns_broad_mass_dist x1_dist = x2_dist = ns_broad_spin_dist else: # pragma: no cover assert_not_reached() elif args.distribution.startswith('nsbh_'): m1_min = bh_mass_min m1_max = bh_mass_max m2_min = ns_mass_min m2_max = ns_mass_max if args.distribution.endswith('_astro'): x1_min = bh_astro_spin_min x1_max = bh_astro_spin_max x2_min = ns_astro_spin_min x2_max = ns_astro_spin_max m1_dist = bh_astro_mass_dist m2_dist = ns_astro_mass_dist x1_dist = bh_astro_spin_dist x2_dist = ns_astro_spin_dist elif args.distribution.endswith('_broad'): x1_min = bh_broad_spin_min x1_max = bh_broad_spin_max x2_min = ns_broad_spin_min x2_max = ns_broad_spin_max m1_dist = bh_broad_mass_dist m2_dist = ns_broad_mass_dist x1_dist = bh_broad_spin_dist x2_dist = ns_broad_spin_dist else: # pragma: no cover assert_not_reached() elif args.distribution.startswith('bbh_'): m1_min = m2_min = bh_mass_min m1_max = m2_max = bh_mass_max if args.distribution.endswith('_astro'): x1_min = x2_min = bh_astro_spin_min x1_max = x2_max = bh_astro_spin_max m1_dist = m2_dist = bh_astro_mass_dist x1_dist = x2_dist = bh_astro_spin_dist elif args.distribution.endswith('_broad'): x1_min = x2_min = bh_broad_spin_min x1_max = x2_max = bh_broad_spin_max m1_dist = m2_dist = bh_broad_mass_dist x1_dist = x2_dist = bh_broad_spin_dist else: # pragma: no cover assert_not_reached() else: # pragma: no cover assert_not_reached() dists = (m1_dist, m2_dist, x1_dist, x2_dist) # Read PSDs psds = list( lal.series.read_psd_xmldoc( ligolw_utils.load_fileobj( args.reference_psd, contenthandler=lal.series.PSDContentHandler)).values()) # Construct mass1, mass2, spin1z, spin2z grid. 
m1 = np.geomspace(m1_min, m1_max, 10) m2 = np.geomspace(m2_min, m2_max, 10) x1 = np.linspace(x1_min, x1_max, 10) x2 = np.linspace(x2_min, x2_max, 10) params = m1, m2, x1, x2 # Calculate the maximum distance on the grid. max_z = gwcosmo.get_max_z(psds, args.waveform, args.f_low, args.min_snr, m1, m2, x1, x2, jobs=args.jobs) if args.max_distance is not None: new_max_z = cosmology.z_at_value(gwcosmo.cosmo.luminosity_distance, args.max_distance * units.Mpc) max_z[max_z > new_max_z] = new_max_z max_distance = gwcosmo.sensitive_distance(max_z).to_value(units.Mpc) # Find piecewise constant approximate upper bound on distance. max_distance = cell_max(max_distance) # Calculate V * T in each grid cell cdfs = [dist.cdf(param) for param, dist in zip(params, dists)] cdf_los = [cdf[:-1] for cdf in cdfs] cdfs = [np.diff(cdf) for cdf in cdfs] probs = np.prod(np.meshgrid(*cdfs, indexing='ij'), axis=0) probs /= probs.sum() probs *= 4 / 3 * np.pi * max_distance**3 volume = probs.sum() probs /= volume probs = probs.ravel() volumetric_rate = args.nsamples / volume * units.year**-1 * units.Mpc**-3 # Draw random grid cells dist = stats.rv_discrete(values=(np.arange(len(probs)), probs)) indices = np.unravel_index(dist.rvs(size=args.nsamples), max_distance.shape) # Draw random intrinsic params from each cell cols = {} cols['mass1'], cols['mass2'], cols['spin1z'], cols['spin2z'] = [ dist.ppf(stats.uniform(cdf_lo[i], cdf[i]).rvs(size=args.nsamples)) for i, dist, cdf_lo, cdf in zip(indices, dists, cdf_los, cdfs) ] # Swap binary components as needed to ensure that mass1 >= mass2. # Note that the .copy() is important. # See https://github.com/numpy/numpy/issues/14428 swap = cols['mass1'] < cols['mass2'] cols['mass1'][swap], cols['mass2'][swap] = \ cols['mass2'][swap].copy(), cols['mass1'][swap].copy() cols['spin1z'][swap], cols['spin2z'][swap] = \ cols['spin2z'][swap].copy(), cols['spin1z'][swap].copy() # Draw random extrinsic parameters cols['distance'] = stats.powerlaw( a=3, scale=max_distance[indices]).rvs(size=args.nsamples) cols['longitude'] = stats.uniform(0, 2 * np.pi).rvs(size=args.nsamples) cols['latitude'] = np.arcsin(stats.uniform(-1, 2).rvs(size=args.nsamples)) cols['inclination'] = np.arccos( stats.uniform(-1, 2).rvs(size=args.nsamples)) cols['polarization'] = stats.uniform(0, 2 * np.pi).rvs(size=args.nsamples) cols['coa_phase'] = stats.uniform(-np.pi, 2 * np.pi).rvs(size=args.nsamples) cols['time_geocent'] = stats.uniform(1e9, units.year.to( units.second)).rvs(size=args.nsamples) # Convert from sensitive distance to redshift and comoving distance. # FIXME: Replace this brute-force lookup table with a solver. z = np.linspace(0, max_z.max(), 10000) ds = gwcosmo.sensitive_distance(z).to_value(units.Mpc) dc = gwcosmo.cosmo.comoving_distance(z).to_value(units.Mpc) z_for_ds = interp1d(ds, z, kind='cubic', assume_sorted=True) dc_for_ds = interp1d(ds, dc, kind='cubic', assume_sorted=True) zp1 = 1 + z_for_ds(cols['distance']) cols['distance'] = dc_for_ds(cols['distance']) # Apply redshift factor to convert from comoving distance and source frame # masses to luminosity distance and observer frame masses. 
for key in ['distance', 'mass1', 'mass2']:
    cols[key] *= zp1

# Populate sim_inspiral table
sims = xmlroot.appendChild(lsctables.New(lsctables.SimInspiralTable))
for row in zip(*cols.values()):
    sims.appendRow(
        **dict(dict.fromkeys(sims.validcolumns, None),
               process_id=process.process_id,
               simulation_id=sims.get_next_id(),
               waveform=args.waveform,
               f_lower=args.f_low,
               **dict(zip(cols.keys(), row))))

# Record process end time.
process.comment = str(volumetric_rate)
process.set_end_time_now()

# Write output file.
write_fileobj(xmldoc, args.output)
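# Side note on the extrinsic distance draw used earlier in this script (a
# sketch, not part of the pipeline): stats.powerlaw(a=3, scale=d_max) has
# pdf p(r) = 3 r**2 / d_max**3 on [0, d_max], i.e. it places sources uniformly
# in Euclidean volume.  d_max below is an illustrative stand-in for the
# per-cell maximum distance.
from scipy import stats

d_max = 100.0
r = stats.powerlaw(a=3, scale=d_max).rvs(size=100000, random_state=0)
print(stats.kstest(r**3 / d_max**3, 'uniform'))  # r**3 should be uniform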
def generateToy():
    np.random.seed(12345)
    fig, ax = plt.subplots()
    triang_arg = 0.5
    # frozen_triangle = triang(c=triang_arg, loc=2)  # up-sloping line from loc to (loc + c*scale), then down-sloping to (loc + scale)
    frozen_triangle = triang(c=0.5, loc=2)  # up-sloping line from loc to (loc + c*scale), then down-sloping to (loc + scale)
    frozen_powerlaw = powerlaw(2)  # powerlaw.pdf(x, a) = a * x**(a-1)
    x = np.linspace(0, 1, 20)
    x2 = np.linspace(0, 1, 20)
    nx = x
    nx2 = x2
    # nd = frozen_powerlaw.ppf(nx)
    # nd = np.array([0, 0.3162, 0.4472, 0.5477, 0.6324, 0.7071, 0.7746, 0.8367, 0.8944, 0.9487])
    nd = np.array([0, 0.140175, 0.264911, 0.378405, 0.48324, 0.581139, 0.67332, 0.760682, 0.843909, 0.923538])
    # nd = np.array([0.0723805, 0.204159, 0.322876, 0.431782, 0.532971, 0.627882, 0.717556, 0.802776, 0.884144, 0.962142])
    # pdf = frozen_powerlaw.pdf(x)
    # nd = frozen_triangle.ppf(nx)
    # pdf = frozen_triangle.pdf(x)
    # for i in range(len(nd) - 1):
    #     print((nd[i + 1] - nd[i]) * (nd[i + 1] + nd[i]))
    # nd2 = frozen_triangle2.ppf(nx2)
    # pdf2 = frozen_triangle2.pdf(x2)
    # ndc = np.concatenate((nd, nd2), axis=0)
    # nxc = np.concatenate((nx, nx2))
    # pdfc = np.concatenate((pdf, pdf2))
    # xc = np.concatenate((x, x2))
    # plt.plot(nd, len(nx) * [1], "x")
    # plt.plot(x, pdf)
    # hist(nd, 'blocks', fitness='poly_events', p0=0.05, histtype='bar', alpha=0.2, label='b blocks', ax=ax, normed=True)
    # plt.plot(nd[0:11], len(nx[0:11]) * [1], "x")
    # plt.plot(x[0:11], pdf[0:11])
    # hist(nd[0:11], 'blocks', fitness='poly_events', p0=0.05, histtype='bar', alpha=0.2, label='b blocks', ax=ax, normed=True)
    # hist(ndc, bins=50, histtype='bar', alpha=0.2, label='b blocks', ax=ax, normed=True)
    # plt.plot(nd[11:], len(nx[11:]) * [1], "x")
    # plt.plot(x[11:], pdf[11:])
    # hist(nd[11:], 'blocks', fitness='poly_events', p0=0.05, histtype='bar', alpha=0.2, label='b blocks', ax=ax, normed=True)
    print(nd)
    plt.plot(nd, len(nd) * [1], "x")
    # plt.plot(x, pdf)
    hist(nd, 'blocks', fitness='poly_events', p0=0.05, histtype='bar', alpha=0.2, label='b blocks', ax=ax)
    plt.show()
    fig.savefig('plots/toy_plots2.png')
def generate_dataset(distriName, para, tot, dis): import sys import random from scipy.stats import powerlaw import numpy as np import math bytePerStr = 4 filepath = "./dataset_temp/" + distriName + '_' + str(para) + '_' + str( tot) + '_' + str(dis) + ".dat" filenum = 1000 filesize = tot // filenum if (os.path.exists("./dataset_temp") == False): os.mkdir("./dataset_temp") os.mkdir("./dataset_temp/temp") def powerlaw(N, s): res = [] base = 0.0 for n in range(1, N + 1): t = 1 / (n**s) base += t res.append(t) return [r / base for r in res] def weibull(N, p, k): res = [] for n in range(0, N, 1): power1 = n**k p1 = (1 - p)**power1 power2 = (n + 1)**k p2 = (1 - p)**power2 res.append(p1 - p2) return res def gen_random_strings(len, byts): strs = set() res = [] for i in range(len): s = os.urandom(byts) while s in strs: s = os.urandom(byts) res.append(s) strs.add(s) return res def gen(freqs, byts): strs = gen_random_strings(len(freqs), byts) chs = [i for i in range(len(freqs))] for fileno in range(0, filenum - 1): temp_filepath = "./dataset_temp/temp/" + str(fileno) + ".dat" with open(temp_filepath, "ab") as f: for j in range(0, filesize): p = random.randint(0, len(chs) - 1) pos = chs[p] f.write(strs[pos]) if (freqs[pos] > 0): freqs[pos] -= 1 if freqs[pos] == 0: del chs[p] f.close() last_filesize = 0 fileno = filenum - 1 temp_filepath = "./dataset_temp/temp/" + str(fileno) + ".dat" with open(temp_filepath, "ab") as f: while len(chs) != 0: p = random.randint(0, len(chs) - 1) pos = chs[p] f.write(strs[pos]) last_filesize += 1 if (freqs[pos] > 0): freqs[pos] -= 1 if freqs[pos] == 0: del chs[p] def read_str(fp, sl, bytesNum): st = fp.read(bytesNum) while st: sl.append(st) st = fp.read(bytesNum) def read_shuffle_write(sl, fp1, fp2, bytesNum): with open(fp1, "rb") as f1: read_str(f1, sl, bytesNum) f1.close() with open(fp2, "rb") as f2: read_str(f2, sl, bytesNum) f2.close() random.shuffle(sl) with open(fp1, "wb") as f1: for j in range(filesize): f1.write(sl[j]) f1.close() with open(fp2, "wb") as f2: for j in range(filesize, len(sl)): f2.write(sl[j]) f2.close() def front_tail_shuffle(bytesNum): frontfile = [i for i in range(100, 200)] tailfile = [i for i in range(900, 1000)] random.shuffle(frontfile) random.shuffle(tailfile) for i in range(0, 100): str_list = [] frontfilepath = "./dataset_temp/temp/" + str(frontfile[i]) + ".dat" tailfilepath = "./dataset_temp/temp/" + str(tailfile[i]) + ".dat" read_shuffle_write(str_list, frontfilepath, tailfilepath, bytesNum) def whole_shuffle(bytesNum): file1 = random.randint(0, filenum - 1) file2 = random.randint(0, filenum - 1) while file2 == file1: file2 = random.randint(0, filenum - 1) str_list = [] filepath1 = "./dataset_temp/temp/" + str(file1) + ".dat" filepath2 = "./dataset_temp/temp/" + str(file2) + ".dat" read_shuffle_write(str_list, filepath1, filepath2, bytesNum) if distriName == 'zipf' or distriName == 'powerlaw': props = powerlaw(dis, para) elif distriName == 'weibull': pa = 0.1 props = weibull(dis, pa, para) freq = [math.ceil(prop * tot) for prop in props] minFreq = min(freq) maxFreq = max(freq) gen(freq, bytePerStr) front_tail_shuffle(bytePerStr) for i in range(100): whole_shuffle(bytePerStr) file_list = [i for i in range(filenum)] random.shuffle(file_list) with open(filepath, "wb") as f: for i in range(filenum): readfilepath = "./dataset_temp/temp/" + str(file_list[i]) + ".dat" with open(readfilepath, "rb") as rf: st = rf.read(bytePerStr) while st: f.write(st) st = rf.read(bytePerStr) rf.close() f.close() for fileno in range(0, filenum): temp_filepath = 
"./dataset_temp/temp/" + str(fileno) + ".dat" os.remove(temp_filepath) os.rmdir("./dataset_temp/temp") return (minFreq, maxFreq)
trial_odds_yes = .05
trial_odds_no = .95  # Odds of going to trial.
regulation_odds_yes = .05  # Odds of having an audit requirement imposed.
regulation_odds_no = .95
# This can be modeled further, to include several other costs we are leaving out.

# Statistical Values
settlements = loadtxt('settlements.dat')  # Loading in external data for settlements
fit = powerlaw.fit(settlements)  # Fitting the data to a power law, which we think is reasonable.
incidents = powerlaw(a=fit[0], loc=fit[1], scale=fit[2])

c.progress("Disclosure Legal")
# Disclosure complexity (Legal)
# (Lawyers * Lawyer Rate * Hours) + (Engineers * Eng Pay * Hours)
disclosure_lawyers = np.random.uniform(
    lawyers_min, lawyers_max, simulations)  # What's the minimum? What's the maximum?
disclosure_lawyer_rate = np.random.normal(
    lawyer_rate_average, lawyer_rate_variance, simulations
)  # Using https://thervo.com/costs/attorney-fees as stand-in data
disclosure_lawyer_hours = np.random.uniform(
    disclosure_lawyer_hours_min, disclosure_lawyer_hours_max,
    simulations)  # At least a day per lawyer, as much as several weeks
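# Hedged sanity-check sketch for the fit above: scipy's powerlaw.fit returns
# (shape, loc, scale), and a one-sample KS test against the frozen fit gives a
# rough goodness-of-fit signal (optimistic, since the same data were used for
# fitting).  Names reuse the variables defined above.
from scipy.stats import kstest

ks_stat, ks_p = kstest(settlements, incidents.cdf)
print("KS vs fitted power law:", ks_stat, ks_p)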