def make_propensity_based_simulated_labeler(treat_strength, con_strength, noise_level,
                                             base_propensity_scores, example_indices,
                                             exogeneous_con=0., setting="simple", seed=42):
    # assumes module-level imports: numpy as np, `from numpy import random`,
    # tensorflow as tf, `from scipy.special import expit, logit`, and an
    # `outcome_sim` helper defined elsewhere in the module
    np.random.seed(seed)
    all_noise = random.normal(0, 1, base_propensity_scores.shape[0]).astype(np.float32)
    all_threshholds = np.array(random.uniform(0, 1, base_propensity_scores.shape[0]), dtype=np.float32)

    extra_confounding = random.normal(0, 1, base_propensity_scores.shape[0]).astype(np.float32)
    all_propensity_scores = expit(
        (1. - exogeneous_con) * logit(base_propensity_scores) + exogeneous_con * extra_confounding
    ).astype(np.float32)
    all_treatments = random.binomial(1, all_propensity_scores).astype(np.int32)

    # indices in the dataset refer to locations in the entire corpus,
    # but propensity scores will typically only include a subset of the examples
    reindex_hack = np.zeros(12000, dtype=np.int32)
    reindex_hack[example_indices] = np.arange(example_indices.shape[0], dtype=np.int32)

    def labeler(data):
        index = data['index']
        index_hack = tf.gather(reindex_hack, index)
        treatment = tf.gather(all_treatments, index_hack)
        confounding = 3.0 * (tf.gather(all_propensity_scores, index_hack) - 0.25)
        noise = tf.gather(all_noise, index_hack)
        y, y0, y1 = outcome_sim(treat_strength, con_strength, noise_level,
                                tf.cast(treatment, tf.float32), confounding, noise,
                                setting=setting)
        # squash the simulated scores to probabilities and threshold against a
        # per-example Uniform(0, 1) draw to get a binary outcome
        simulated_prob = tf.nn.sigmoid(y)
        y0 = tf.nn.sigmoid(y0)
        y1 = tf.nn.sigmoid(y1)
        threshold = tf.gather(all_threshholds, index)
        simulated_outcome = tf.cast(tf.greater(simulated_prob, threshold), tf.int32)

        return {**data, 'outcome': simulated_outcome, 'y0': y0, 'y1': y1, 'treatment': treatment}

    return labeler
def test_normal(self):
    # assumes: numpy as np, and `rnd` is the random module under test
    # (its normal() accepts a `method=` keyword and seed(seed, brng))
    rnd.seed(self.seed, self.brng)
    actual = rnd.normal(loc=.123456789, scale=2.0, size=(3, 2))
    desired = np.array([[4.405778774782659, -4.020116569348963],
                        [-1.732103577371005, 1.2282652034983546],
                        [0.21648943171034918, 4.625591634211608]])
    np.testing.assert_array_almost_equal(actual, desired, decimal=7)

    rnd.seed(self.seed, self.brng)
    actual = rnd.normal(loc=.123456789, scale=2.0, size=(3, 2), method="BoxMuller")
    desired = np.array([[0.16673479781277187, -3.4809986872165952],
                        [-0.05193761082535492, 3.249201213154922],
                        [-0.11915582299214138, 3.555636100927892]])
    np.testing.assert_array_almost_equal(actual, desired, decimal=8)

    rnd.seed(self.seed, self.brng)
    actual = rnd.normal(loc=.123456789, scale=2.0, size=(3, 2), method="BoxMuller2")
    desired = np.array([[0.16673479781277187, 0.48153966449249175],
                        [-3.4809986872165952, -0.8101190082826486],
                        [-0.051937610825354905, 2.4088402362484342]])
    np.testing.assert_array_almost_equal(actual, desired, decimal=7)
def make_propensity_based_simulated_labeler(treat_strength, con_strength, noise_level,
                                             base_propensity_scores, example_indices,
                                             exogeneous_con=0., setting="simple", seed=42):
    # assumes module-level imports: numpy as np, `from numpy import random`,
    # tensorflow as tf, `from scipy.special import expit, logit`, and an
    # `outcome_sim` helper defined elsewhere in the module
    np.random.seed(seed)
    all_noise = random.normal(0, 1, base_propensity_scores.shape[0]).astype(np.float32)

    # extra_confounding = random.binomial(1, 0.5*np.ones_like(base_propensity_scores)).astype(np.float32)
    extra_confounding = random.normal(0, 1, base_propensity_scores.shape[0]).astype(np.float32)

    all_propensity_scores = expit(
        (1. - exogeneous_con) * logit(base_propensity_scores) + exogeneous_con * extra_confounding
    ).astype(np.float32)
    all_treatments = random.binomial(1, all_propensity_scores).astype(np.int32)

    # indices in the dataset refer to locations in the entire corpus,
    # but propensity scores will typically only include a subset of the examples
    reindex_hack = np.zeros(422206, dtype=np.int32)
    reindex_hack[example_indices] = np.arange(example_indices.shape[0], dtype=np.int32)

    def labeler(data):
        index = data['index']
        index_hack = tf.gather(reindex_hack, index)
        treatment = tf.gather(all_treatments, index_hack)
        confounding = tf.gather(all_propensity_scores, index_hack) - 0.5
        noise = tf.gather(all_noise, index_hack)
        simulated_score, y0, y1 = outcome_sim(treat_strength, con_strength, noise_level,
                                              tf.cast(treatment, tf.float32), confounding, noise,
                                              setting=setting)

        return {
            **data,
            'outcome': simulated_score,
            'y0': y0,
            'y1': y1,
            'treatment': treatment
        }

    return labeler
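# Usage sketch (illustrative, not from the original source): the factory above returns a
# labeler meant to be mapped over a tf.data.Dataset of feature dicts keyed by 'index'.
# The parameter values and synthetic inputs below are assumptions, and running this
# requires the surrounding module (in particular its `outcome_sim` helper and imports).
import numpy as np
import tensorflow as tf

n = 100
base_propensity_scores = np.random.uniform(0.05, 0.95, n).astype(np.float32)
example_indices = np.arange(n, dtype=np.int32)

labeler = make_propensity_based_simulated_labeler(
    treat_strength=0.25, con_strength=5.0, noise_level=1.0,
    base_propensity_scores=base_propensity_scores,
    example_indices=example_indices,
    exogeneous_con=0., setting="simple", seed=42)

dataset = tf.data.Dataset.from_tensor_slices({'index': example_indices}).map(labeler)
# each element now also carries 'outcome', 'y0', 'y1', and 'treatment'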
def make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level,
                                       setting="simple", seed=0):
    # assumes module-level imports: numpy as np, `from numpy import random`,
    # tensorflow as tf, and an `outcome_sim` helper defined elsewhere in the module
    # hardcode probability of theorem given buzzy / not_buzzy
    theorem_given_buzzy_probs = np.array([0.27, 0.07], dtype=np.float32)

    np.random.seed(seed)
    all_noise = np.array(random.normal(0, 1, 12000), dtype=np.float32)
    all_threshholds = np.array(random.uniform(0, 1, 12000), dtype=np.float32)

    def labeler(data):
        buzzy = data['buzzy_title']
        index = data['index']
        treatment = data['theorem_referenced']
        treatment = tf.cast(treatment, tf.float32)
        confounding = 3.0 * (tf.gather(theorem_given_buzzy_probs, buzzy) - 0.25)
        noise = tf.gather(all_noise, index)
        y, y0, y1 = outcome_sim(treat_strength, con_strength, noise_level,
                                treatment, confounding, noise, setting=setting)
        # squash simulated scores to probabilities and threshold against a
        # per-example Uniform(0, 1) draw to get a binary outcome
        simulated_prob = tf.nn.sigmoid(y)
        y0 = tf.nn.sigmoid(y0)
        y1 = tf.nn.sigmoid(y1)
        threshold = tf.gather(all_threshholds, index)
        simulated_outcome = tf.cast(tf.greater(simulated_prob, threshold), tf.int32)

        return {**data, 'outcome': simulated_outcome, 'y0': y0, 'y1': y1}

    return labeler
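# Added note (not from the original source) on the thresholding step above: comparing a
# probability p against an independent Uniform(0, 1) draw yields 1 with probability exactly
# p, so `simulated_outcome` is a Bernoulli(sigmoid(y)) sample. A minimal numpy check:
import numpy as np

rng = np.random.RandomState(0)
p = 0.7
outcomes = (p > rng.uniform(0, 1, size=100000)).astype(np.int32)
print(outcomes.mean())  # close to 0.7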
def __call__(self, sample):
    # assumes: numpy as np, and `rnd` is numpy.random (or an equivalent module)
    for channel in sample:
        # add zero-mean Gaussian noise to each channel's feature array
        noise = rnd.normal(loc=0.0, scale=self.sigma,
                           size=channel['features'].shape).astype(np.float32)
        channel['features'] += noise
    return sample
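# Illustrative, self-contained sketch (not from the original source): the __call__ above is
# the body of a dataset transform; wrapping it in a minimal class shows how it would be
# applied. The class name `AddGaussianNoise` and the sigma value are assumptions.
import numpy as np
from numpy import random as rnd

class AddGaussianNoise:
    def __init__(self, sigma):
        self.sigma = sigma

    def __call__(self, sample):
        for channel in sample:
            noise = rnd.normal(loc=0.0, scale=self.sigma,
                               size=channel['features'].shape).astype(np.float32)
            channel['features'] += noise
        return sample

sample = [{'features': np.zeros((2, 4), dtype=np.float32)}]
noisy = AddGaussianNoise(sigma=0.1)(sample)
print(noisy[0]['features'])  # zeros perturbed by N(0, 0.1**2) noise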
def make_subreddit_based_simulated_labeler(treat_strength, con_strength, noise_level,
                                           setting="simple", seed=42):
    # assumes module-level imports: numpy as np, `from numpy import random`,
    # tensorflow as tf, and an `outcome_sim` helper defined elsewhere in the module
    # hardcode gender proportions of each subreddit :'(
    gender_props = np.array([
        0.08290155440414508, 0.9306885544915641, 0.9444306623666584,
        0.053265121877821245, 0.0836100211288862, 0.9018952928382787,
        0.6491243280735217, 0.7985401459854015, 0.3436175847457627,
        0.2293529255554572, 0.7604441360166551, 0.04929765886287625,
        0.6117755289788408, 0.515695067264574, 0.24193122130091507,
        0.06660675582809114, 0.5266344888108819, 0.875792872794372,
        0.8210111788617886, 0.0022985674998973853
    ], dtype=np.float32)

    np.random.seed(seed)
    all_noise = np.array(random.normal(0, 1, 422206), dtype=np.float32)

    def labeler(data):
        subreddit_idx = data['subreddit']
        index = data['index']
        treatment = data['gender']
        treatment = tf.cast(treatment, tf.float32)
        confounding = tf.gather(gender_props, subreddit_idx) - 0.5
        noise = tf.gather(all_noise, index)
        simulated_score, y0, y1 = outcome_sim(treat_strength, con_strength, noise_level,
                                              treatment, confounding, noise, setting=setting)

        return {
            **data,
            'outcome': simulated_score,
            'treatment': treatment,
            'y0': y0,
            'y1': y1
        }  # , 'confounding': confounding}

    return labeler
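# Added note (not from the original source): because the labeler stores both potential
# outcomes 'y0' and 'y1', the ground-truth (oracle) average treatment effect of a simulated
# dataset can be read off directly, e.g. in eager mode after mapping the labeler:
#
# y0s = np.array([ex['y0'].numpy() for ex in dataset])
# y1s = np.array([ex['y1'].numpy() for ex in dataset])
# oracle_ate = (y1s - y0s).mean()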
def generate(self, n):
    # assumes: `from numpy import dot` and `from numpy.random import normal`
    # draw n feature rows scaled so each has unit expected squared norm
    X = normal(size=(n, self.dimension)) / (self.dimension ** 0.5)
    Y = dot(X, self.v)
    if self.sigma != 0:
        Y += normal(scale=self.sigma, size=n)
    return (X, Y)
def __init__(self, dimension, sigma):
    # assumes: `from numpy.random import normal`
    self.dimension = dimension
    self.sigma = sigma
    # random linear coefficient vector used by generate()
    self.v = normal(size=(self.dimension,))
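# Usage sketch (illustrative, not from the original source): generate and __init__ above
# belong to a linear-regression data generator; the class name below is an assumption.
#
# gen = LinearDataGenerator(dimension=10, sigma=0.1)
# X, Y = gen.generate(n=1000)
# # X: (1000, 10) features with unit expected squared row norm;
# # Y: X @ self.v plus N(0, sigma**2) noise when sigma != 0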