Ejemplo n.º 1
0
def make_propensity_based_simulated_labeler(treat_strength, con_strength, noise_level,
                                            base_propensity_scores, example_indices, exogeneous_con=0.,
                                            setting="simple", seed=42):
    """Build a labeler that adds a simulated treatment and binary outcome to examples.

    All per-example randomness (noise, outcome thresholds, propensity scores,
    treatments) is drawn once when this factory runs, so the returned labeler
    always produces the same simulated values for a given example.

    Args:
        treat_strength: forwarded unchanged to ``outcome_sim``.
        con_strength: forwarded unchanged to ``outcome_sim``.
        noise_level: forwarded unchanged to ``outcome_sim``.
        base_propensity_scores: array of propensity scores; its first dimension
            sets the number of simulated examples. Values are passed through
            ``logit``.
        example_indices: integer array giving, for each entry of
            ``base_propensity_scores`` (same order), that example's index in
            the full corpus. Values must be smaller than 12000, the size of
            the remapping table.
        exogeneous_con: mixing weight ``w``; the propensity logit becomes
            ``(1 - w) * logit(base) + w * z`` with ``z`` a standard normal draw.
        setting: forwarded to ``outcome_sim`` as its ``setting`` keyword.
        seed: value passed to ``np.random.seed`` before any draw.

    Returns:
        A function taking a dict with an ``'index'`` entry (corpus indices) and
        returning a new dict with the same entries plus ``'outcome'`` (int32,
        1 where the sigmoid of the simulated score exceeds the example's
        threshold), ``'y0'`` and ``'y1'`` (sigmoids of the second and third
        outputs of ``outcome_sim``) and ``'treatment'`` (int32).
    """
    # NOTE(review): `random` is presumably numpy.random, in which case this
    # call seeds every draw below -- confirm against the file's imports.
    np.random.seed(seed)
    num_examples = base_propensity_scores.shape[0]

    # Keep the draw order fixed: changing it changes the simulated data for a
    # given seed.
    all_noise = random.normal(0, 1, num_examples).astype(np.float32)
    all_threshholds = np.array(random.uniform(0, 1, num_examples), dtype=np.float32)

    extra_confounding = random.normal(0, 1, num_examples).astype(np.float32)

    all_propensity_scores = expit((1.-exogeneous_con)*logit(base_propensity_scores) + exogeneous_con * extra_confounding).astype(np.float32)
    all_treatments = random.binomial(1, all_propensity_scores).astype(np.int32)

    # Indices in the dataset refer to locations in the entire corpus,
    # but propensity scores will typically only include a subset of the examples,
    # so map corpus index -> row in the per-example tables above.
    # Corpus indices that are not listed in example_indices map to row 0.
    reindex_hack = np.zeros(12000, dtype=np.int32)
    reindex_hack[example_indices] = np.arange(example_indices.shape[0], dtype=np.int32)

    def labeler(data):
        index = data['index']
        index_hack = tf.gather(reindex_hack, index)
        treatment = tf.gather(all_treatments, index_hack)
        confounding = 3.0 * (tf.gather(all_propensity_scores, index_hack) - 0.25)
        noise = tf.gather(all_noise, index_hack)

        y, y0, y1 = outcome_sim(treat_strength, con_strength, noise_level, tf.cast(treatment, tf.float32), confounding, noise, setting=setting)
        simulated_prob = tf.nn.sigmoid(y)
        y0 = tf.nn.sigmoid(y0)
        y1 = tf.nn.sigmoid(y1)
        # The threshold table has one entry per simulated example, exactly like
        # the noise / treatment tables, so it must be read with the remapped
        # index; the raw corpus index is misaligned and may be out of range.
        threshold = tf.gather(all_threshholds, index_hack)
        simulated_outcome = tf.cast(tf.greater(simulated_prob, threshold), tf.int32)

        return {**data, 'outcome': simulated_outcome, 'y0': y0, 'y1': y1, 'treatment': treatment}

    return labeler
Ejemplo n.º 2
0
    def test_normal(self):
        """Compare ``rnd.normal`` with stored reference draws.

        Three variants are checked, each after reseeding with
        ``self.seed`` / ``self.brng``: the default call, ``method="BoxMuller"``
        and ``method="BoxMuller2"``. Every case carries its own comparison
        precision (number of decimals).
        """
        # (extra keyword arguments, expected 3x2 draw, decimals to compare)
        cases = (
            ({},
             [[4.405778774782659, -4.020116569348963],
              [-1.732103577371005, 1.2282652034983546],
              [0.21648943171034918, 4.625591634211608]],
             7),
            ({"method": "BoxMuller"},
             [[0.16673479781277187, -3.4809986872165952],
              [-0.05193761082535492, 3.249201213154922],
              [-0.11915582299214138, 3.555636100927892]],
             8),
            ({"method": "BoxMuller2"},
             [[0.16673479781277187, 0.48153966449249175],
              [-3.4809986872165952, -0.8101190082826486],
              [-0.051937610825354905, 2.4088402362484342]],
             7),
        )

        for extra_kwargs, expected, places in cases:
            # Reseed before every draw so each case starts from the same state.
            rnd.seed(self.seed, self.brng)
            drawn = rnd.normal(loc=.123456789, scale=2.0, size=(3, 2),
                               **extra_kwargs)
            np.testing.assert_array_almost_equal(drawn, np.array(expected),
                                                 decimal=places)
Ejemplo n.º 3
0
def make_propensity_based_simulated_labeler(treat_strength,
                                            con_strength,
                                            noise_level,
                                            base_propensity_scores,
                                            example_indices,
                                            exogeneous_con=0.,
                                            setting="simple",
                                            seed=42):
    """Build a labeler that adds a simulated treatment and outcome score.

    Per-example noise, propensity scores and treatments are drawn once when
    this factory runs; the returned labeler only looks them up.

    Args:
        treat_strength: forwarded unchanged to ``outcome_sim``.
        con_strength: forwarded unchanged to ``outcome_sim``.
        noise_level: forwarded unchanged to ``outcome_sim``.
        base_propensity_scores: array of propensity scores; its first
            dimension sets the number of simulated examples.
        example_indices: integer array with the corpus index of each entry of
            ``base_propensity_scores`` (same order); values must be smaller
            than 422206, the size of the remapping table.
        exogeneous_con: mixing weight ``w``; the propensity logit becomes
            ``(1 - w) * logit(base) + w * z`` with ``z`` a standard normal
            draw.
        setting: forwarded to ``outcome_sim`` as its ``setting`` keyword.
        seed: value passed to ``np.random.seed`` before any draw.

    Returns:
        A function mapping a dict with an ``'index'`` entry to a new dict
        with the same entries plus ``'outcome'``, ``'y0'``, ``'y1'`` (the
        three outputs of ``outcome_sim``) and ``'treatment'`` (int32).
    """
    np.random.seed(seed)
    num_examples = base_propensity_scores.shape[0]

    # Draw order (noise, exogenous confounder, treatments) is part of the
    # behavior: it fixes which values a given seed produces.
    noise_table = random.normal(0, 1, num_examples).astype(np.float32)
    # A Bernoulli(0.5) exogenous confounder was tried here at some point; the
    # standard normal draw below is the one in use.
    exogenous_draws = random.normal(0, 1, num_examples).astype(np.float32)

    mixed_logits = ((1. - exogeneous_con) * logit(base_propensity_scores) +
                    exogeneous_con * exogenous_draws)
    propensity_table = expit(mixed_logits).astype(np.float32)
    treatment_table = random.binomial(1, propensity_table).astype(np.int32)

    # Dataset indices point into the whole corpus, while the tables above
    # only cover the examples that have a propensity score: translate corpus
    # index -> table row. Unlisted corpus indices translate to row 0.
    corpus_to_row = np.zeros(422206, dtype=np.int32)
    row_ids = np.arange(example_indices.shape[0], dtype=np.int32)
    corpus_to_row[example_indices] = row_ids

    def labeler(data):
        row = tf.gather(corpus_to_row, data['index'])
        treatment = tf.gather(treatment_table, row)
        confounding = tf.gather(propensity_table, row) - 0.5
        noise = tf.gather(noise_table, row)

        simulated_score, y0, y1 = outcome_sim(
            treat_strength, con_strength, noise_level,
            tf.cast(treatment, tf.float32), confounding, noise,
            setting=setting)

        simulated = {
            'outcome': simulated_score,
            'y0': y0,
            'y1': y1,
            'treatment': treatment,
        }
        return {**data, **simulated}

    return labeler
Ejemplo n.º 4
0
def make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level, setting="simple", seed=0):
    """Build a labeler that simulates a binary outcome from title "buzziness".

    The treatment is the example's own ``'theorem_referenced'`` value; the
    confounder is derived from a hard-coded probability looked up by the
    example's ``'buzzy_title'`` value.

    Args:
        treat_strength: forwarded unchanged to ``outcome_sim``.
        con_strength: forwarded unchanged to ``outcome_sim``.
        noise_level: forwarded unchanged to ``outcome_sim``.
        setting: forwarded to ``outcome_sim`` as its ``setting`` keyword.
        seed: value passed to ``np.random.seed`` before the draws.

    Returns:
        A function mapping a dict with ``'buzzy_title'``, ``'index'`` and
        ``'theorem_referenced'`` entries to a new dict with the same entries
        plus ``'outcome'`` (int32, 1 where the sigmoid of the simulated score
        exceeds the example's threshold), ``'y0'`` and ``'y1'`` (sigmoids of
        the second and third outputs of ``outcome_sim``).
    """
    # Hard-coded probability of a theorem reference given buzzy / not buzzy,
    # indexed by the 'buzzy_title' value.
    theorem_given_buzzy_probs = np.array([0.27, 0.07], dtype=np.float32)

    # One noise value and one threshold per corpus position.
    table_size = 12000
    np.random.seed(seed)
    noise_table = np.array(random.normal(0, 1, table_size), dtype=np.float32)
    threshold_table = np.array(random.uniform(0, 1, table_size), dtype=np.float32)

    def labeler(data):
        buzzy = data['buzzy_title']
        index = data['index']
        treatment = tf.cast(data['theorem_referenced'], tf.float32)

        theorem_prob = tf.gather(theorem_given_buzzy_probs, buzzy)
        confounding = 3.0*(theorem_prob - 0.25)
        noise = tf.gather(noise_table, index)

        raw_y, raw_y0, raw_y1 = outcome_sim(treat_strength, con_strength, noise_level,
                                            treatment, confounding, noise, setting=setting)
        simulated_prob = tf.nn.sigmoid(raw_y)
        y0 = tf.nn.sigmoid(raw_y0)
        y1 = tf.nn.sigmoid(raw_y1)

        # Binarize: the outcome is 1 when the probability beats the
        # example's pre-drawn uniform threshold.
        threshold = tf.gather(threshold_table, index)
        exceeds = tf.greater(simulated_prob, threshold)
        simulated_outcome = tf.cast(exceeds, tf.int32)

        return {**data, 'outcome': simulated_outcome, 'y0': y0, 'y1': y1}

    return labeler
Ejemplo n.º 5
0
 def __call__(self, sample):
     """Add random noise to the ``'features'`` entry of every item in ``sample``.

     For each item, noise is drawn with ``rnd.normal(loc=0.0,
     scale=self.sigma)`` in the shape of the item's features, cast to
     float32 and added with ``+=``. The same ``sample`` object is returned.
     """
     for entry in sample:
         features = entry['features']
         perturbation = rnd.normal(loc=0.0,
                                   scale=self.sigma,
                                   size=features.shape)
         features += perturbation.astype(np.float32)
         # Store the result back, as the augmented assignment on the item did.
         entry['features'] = features
     return sample
Ejemplo n.º 6
0
def make_subreddit_based_simulated_labeler(treat_strength,
                                           con_strength,
                                           noise_level,
                                           setting="simple",
                                           seed=42):
    """Build a labeler that simulates an outcome score from subreddit and gender.

    The treatment is the example's own ``'gender'`` value; the confounder is
    the hard-coded gender proportion of the example's subreddit, shifted by
    0.5.

    Args:
        treat_strength: forwarded unchanged to ``outcome_sim``.
        con_strength: forwarded unchanged to ``outcome_sim``.
        noise_level: forwarded unchanged to ``outcome_sim``.
        setting: forwarded to ``outcome_sim`` as its ``setting`` keyword.
        seed: value passed to ``np.random.seed`` before the noise draw.

    Returns:
        A function mapping a dict with ``'subreddit'``, ``'index'`` and
        ``'gender'`` entries to a new dict with the same entries plus
        ``'outcome'``, ``'y0'``, ``'y1'`` (the three outputs of
        ``outcome_sim``) and ``'treatment'`` (the gender cast to float32).
    """
    # Hard-coded gender proportion of each subreddit, indexed by the
    # 'subreddit' value.
    gender_props = np.array(
        [
            0.08290155440414508, 0.9306885544915641, 0.9444306623666584,
            0.053265121877821245, 0.0836100211288862, 0.9018952928382787,
            0.6491243280735217, 0.7985401459854015, 0.3436175847457627,
            0.2293529255554572, 0.7604441360166551, 0.04929765886287625,
            0.6117755289788408, 0.515695067264574, 0.24193122130091507,
            0.06660675582809114, 0.5266344888108819, 0.875792872794372,
            0.8210111788617886, 0.0022985674998973853,
        ],
        dtype=np.float32)

    # One noise value per corpus position.
    table_size = 422206
    np.random.seed(seed)
    noise_table = np.array(random.normal(0, 1, table_size), dtype=np.float32)

    def labeler(data):
        subreddit_idx = data['subreddit']
        index = data['index']
        treatment = tf.cast(data['gender'], tf.float32)

        confounding = tf.gather(gender_props, subreddit_idx) - 0.5
        noise = tf.gather(noise_table, index)

        simulated_score, y0, y1 = outcome_sim(
            treat_strength, con_strength, noise_level,
            treatment, confounding, noise,
            setting=setting)

        simulated = {
            'outcome': simulated_score,
            'treatment': treatment,
            'y0': y0,
            'y1': y1,
        }
        return {**data, **simulated}

    return labeler
Ejemplo n.º 7
0
 def generate(self, n):
     """Draw ``n`` random inputs together with their linear responses.

     Returns a tuple ``(X, Y)``: ``X`` is ``normal(size=(n, dimension))``
     divided by the square root of the dimension, and ``Y`` is
     ``dot(X, self.v)``. When ``self.sigma`` is non-zero, noise drawn with
     ``normal(scale=self.sigma, size=n)`` is added to ``Y``.
     """
     dim = self.dimension
     inputs = normal(size=(n, dim)) / (dim**0.5)
     responses = dot(inputs, self.v)
     if self.sigma != 0:
         # Noisy setting: perturb the responses.
         responses += normal(scale=self.sigma, size=n)
     return (inputs, responses)
Ejemplo n.º 8
0
 def __init__(self, dimension, sigma):
     """Store the dimension and noise scale, and draw the vector ``self.v``.

     ``self.v`` is sampled once here with ``normal(size=(dimension,))``.
     """
     self.dimension, self.sigma = dimension, sigma
     self.v = normal(size=(dimension, ))