def simulate(params, assignments, seed_increment):
    """Simulate outcomes with person, district, and match effects plus an
    AR(1) district-level error, returning the augmented assignments frame.

    Parameters
    ----------
    params : dict
        Needs keys 'var mu', 'var theta', 'var delta', 'ar1 param'.
    assignments : pandas.DataFrame
        Needs columns 'distcode', 'person', 'month_id'.
    seed_increment : int
        Seed for numpy's global RNG, so each simulation run is reproducible.

    Returns
    -------
    pandas.DataFrame
        `assignments` with added columns 'delta' (district effect),
        'mu' (person effect), 'theta' (match effect), 'error' (AR(1)
        shock), and 'outcome' (their sum).
    """
    # Unpack parameters.
    var_mu, var_theta, var_delta, rho = (params['var mu'],
                                         params['var theta'],
                                         params['var delta'],
                                         params['ar1 param'])
    np.random.seed(seed_increment)
    # Idiosyncratic variance is the residual of a unit total variance.
    var_epsilon = 1 - var_theta - var_mu
    std_epsilon = var_epsilon**.5

    # District, person, and district-person ("match") effects, one i.i.d.
    # normal draw per unique identifier.
    assignments['delta'] = fill_effects(assignments['distcode'].values,
                                        var_delta**.5)
    assignments['mu'] = fill_effects(assignments['person'].values,
                                     var_mu**.5)
    postings = list(zip(assignments['distcode'].values,
                        assignments['person'].values))
    assignments['theta'] = fill_effects(postings, var_theta**.5)

    # Create a balanced district x month panel so serial correlation can be
    # generated even when districts drop in and out of the observed data.
    districts = remove_duplicates(assignments['distcode'].values)
    times = remove_duplicates(assignments['month_id'].values)
    T = len(times)
    D = len(districts)

    # Introduce serially correlated errors: e_t = rho * e_{t-1} + innovation.
    all_errors = np.empty((T, D))
    current_error = np.random.normal(0, std_epsilon, D)
    all_errors[0, :] = current_error
    for t in range(1, T):
        # std_epsilon is hoisted; it equals var_epsilon**.5.
        current_error = (rho * current_error
                         + np.random.normal(0, std_epsilon, D))
        all_errors[t, :] = current_error

    # Row-major flatten matches tile(districts) within repeat(times).
    balanced_panel = pd.DataFrame({'distcode': np.tile(districts, T),
                                   'month_id': np.array(times).repeat(D),
                                   'error': all_errors.flatten()})
    assignments = pd.merge(assignments, balanced_panel, how='left')
    assignments['outcome'] = (assignments['mu'] + assignments['delta']
                              + assignments['theta'] + assignments['error'])
    return assignments
def simulate(params, assignments, seed_increment):
    """Simulate outcomes as the sum of a person effect, a district effect,
    a person-district match effect, and an AR(1) district-month error.

    Parameters
    ----------
    params : dict
        Needs keys 'var mu', 'var theta', 'var delta', 'ar1 param'.
    assignments : pandas.DataFrame
        Needs columns 'distcode', 'person', 'month_id'.
    seed_increment : int
        Seed for numpy's global RNG for reproducibility.

    Returns
    -------
    pandas.DataFrame
        `assignments` with 'delta', 'mu', 'theta', 'error', 'outcome' added.
    """
    # Unpack parameters.
    var_mu = params['var mu']
    var_theta = params['var theta']
    var_delta = params['var delta']
    rho = params['ar1 param']
    np.random.seed(seed_increment)
    # Total variance is normalized to 1; epsilon takes the remainder.
    var_epsilon = 1 - var_theta - var_mu
    std_epsilon = var_epsilon**.5

    # One normal draw per unique district / person / (district, person) pair.
    assignments['delta'] = fill_effects(assignments['distcode'].values,
                                        var_delta**.5)
    assignments['mu'] = fill_effects(assignments['person'].values, var_mu**.5)
    postings = list(
        zip(assignments['distcode'].values, assignments['person'].values))
    assignments['theta'] = fill_effects(postings, var_theta**.5)

    # Build a balanced panel in which districts are always present, and use
    # it to create serial correlation in the district-level errors.
    districts = remove_duplicates(assignments['distcode'].values)
    times = remove_duplicates(assignments['month_id'].values)
    T = len(times)
    D = len(districts)

    # Serially correlated errors: e_t = rho * e_{t-1} + N(0, std_epsilon).
    all_errors = np.empty((T, D))
    current_error = np.random.normal(0, std_epsilon, D)
    all_errors[0, :] = current_error
    for t in range(1, T):
        current_error = rho * current_error + np.random.normal(
            0, std_epsilon, D)  # std_epsilon == var_epsilon**.5, hoisted
        all_errors[t, :] = current_error

    balanced_panel = pd.DataFrame({
        'distcode': np.tile(districts, T),
        'month_id': np.array(times).repeat(D),
        'error': all_errors.flatten()
    })
    assignments = pd.merge(assignments, balanced_panel, how='left')
    assignments['outcome'] = (assignments['mu'] + assignments['delta']
                              + assignments['theta'] + assignments['error'])
    return assignments
def fill_effects(identifiers, st_dev):
    """Draw one N(0, st_dev) effect per unique identifier and map it back
    onto every occurrence in `identifiers`.

    Parameters
    ----------
    identifiers : sequence of hashables
        May contain repeats; repeated ids share the same draw.
    st_dev : float
        Standard deviation of the effect distribution.

    Returns
    -------
    numpy.ndarray
        One effect per entry of `identifiers`, aligned positionally.
        (Previously a plain list in the nonzero case and an ndarray in the
        zero case; now an ndarray in both, for a consistent return type.)
    """
    if st_dev == 0:
        # Degenerate distribution: every effect is exactly zero.
        return np.zeros(len(identifiers))
    no_dup_ids = remove_duplicates(identifiers)
    id_effect_dict = dict(
        zip(no_dup_ids, np.random.normal(0, st_dev, len(no_dup_ids))))
    return np.array([id_effect_dict[id_] for id_ in identifiers])
def fill_effects(identifiers, st_dev):
    """Assign a N(0, st_dev) random effect to each unique identifier and
    broadcast it back over every (possibly repeated) entry of `identifiers`.

    Returns an ndarray of zeros when st_dev is 0, otherwise a list with one
    effect per entry, repeated ids sharing the same draw.
    """
    if st_dev == 0:
        return np.zeros(len(identifiers))
    unique_ids = remove_duplicates(identifiers)
    draws = np.random.normal(0, st_dev, len(unique_ids))
    effect_of = dict(zip(unique_ids, draws))
    return [effect_of[key] for key in identifiers]
def reassign(state_df, time_var):
    """Randomly permute person-to-district assignments within one state,
    period by period, while preserving "stayers": a person who kept the
    same district in the original data keeps their (simulated) district in
    the permuted data as well.  Mutates and returns `state_df`, overwriting
    its 'person' column with the simulated assignment.

    Parameters: state_df needs columns 'distcode', 'person', 'state', and
    the column named by `time_var`.
    """
    times = sorted(set(state_df[time_var]))
    # First period: simulation starts from the original assignment.
    last_df = state_df[state_df[time_var] == np.min(times)]
    last_assignments_from_orig = dict(
        zip(last_df['distcode'], last_df['person']))
    # NOTE(review): last_assignments_from_sim is assigned here but never
    # read in this version of reassign — confirm whether it can be dropped.
    last_assignments_from_sim = dict(
        zip(last_df['distcode'], last_df['person']))
    last_districts = remove_duplicates(last_df['distcode'])
    # person -> simulated district in the previous period.
    last_district_from_sim = dict(zip(last_df['person'], last_df['distcode']))
    # Text presumably logs/records the simulated assignments — defined
    # elsewhere in this file.
    text = Text(state_df['state'].values[0], last_assignments_from_orig)
    for t in times[1:]:
        indices = pd.Series(state_df[time_var] == t)
        current_df = state_df[indices]
        current_districts = remove_duplicates(current_df['distcode'])
        # Sanity checks: one row per district in each period.
        assert set(current_districts) == set(current_df['distcode'])
        assert len(current_districts) == len(current_df)
        current_assignments_from_orig = dict(
            zip(current_df['distcode'], current_df['person']))
        """ Find people who are in the state in the current period and last period, AND are assigned to the same district in both period. Find the districts they go with. """
        districts_with_continuing_people = \
            [dist for dist in current_districts
             if dist in last_districts
             and last_assignments_from_orig[dist]
             == current_assignments_from_orig[dist]]
        assert len(districts_with_continuing_people) == len(
            set(districts_with_continuing_people))
        assert set(districts_with_continuing_people).issubset(
            set(current_df['distcode']))
        people_continuing_in_district = [
            last_assignments_from_orig[dist]
            for dist in districts_with_continuing_people
        ]
        assert len(people_continuing_in_district) == len(
            set(people_continuing_in_district))
        assert set(people_continuing_in_district).issubset(
            set(current_df['person']))
        # The simulated districts those stayers occupied last period; a
        # stayer keeps that simulated district only if it still exists now.
        continuation_people_districts = [
            last_district_from_sim[p] for p in people_continuing_in_district
            if last_district_from_sim[p] in current_districts
        ]
        assert len(continuation_people_districts) == len(
            set(continuation_people_districts))
        assert set(continuation_people_districts).issubset(
            set(current_df['distcode']))
        other_people = [
            p for p in remove_duplicates(current_df['person'])
            if p not in people_continuing_in_district
        ]
        assert set(other_people) | set(people_continuing_in_district) == set(
            current_df['person'])
        """ Create new assignments: - People who continue in the same district do so - Everyone else is randomly assigned to one of the other districts """
        np.random.shuffle(other_people)
        person_assignments = people_continuing_in_district + other_people
        assert set(person_assignments) == set(current_df['person'])
        other_districts = [
            d for d in current_districts
            if d not in continuation_people_districts
        ]
        assert len(other_districts) == len(set(other_districts))
        assert set(other_districts).issubset(set(current_districts))
        district_assignments = continuation_people_districts + other_districts
        assert set(district_assignments) == set(current_districts)
        assert len(district_assignments) == len(person_assignments)
        # Zip stayers with their kept districts and shuffled movers with the
        # remaining districts, positionally.
        assignments = dict(zip(district_assignments, person_assignments))
        text.append('simulated assignments', assignments)
        # Update data with new changes.  The full-column map is aligned back
        # onto only the rows selected by `indices`.
        state_df.loc[indices, 'person'] = state_df['distcode'].map(assignments)
        last_assignments_from_orig = current_assignments_from_orig.copy()
        last_district_from_sim = dict(
            zip(person_assignments, district_assignments))
        last_districts = current_districts.copy()
    return state_df
def convert_vector_to_index_dict(vector):
    """Map each distinct value of `vector` to its position among the
    de-duplicated values (order as produced by remove_duplicates)."""
    index_of = {}
    for position, value in enumerate(remove_duplicates(vector)):
        index_of[value] = position
    return index_of
def fill_effects(identifiers, st_dev):
    """Draw one N(0, st_dev) effect per unique identifier and map it back
    onto every occurrence in `identifiers`.

    Parameters
    ----------
    identifiers : sequence of hashables
        May contain repeats; repeated ids share the same draw.
    st_dev : float
        Standard deviation of the effect distribution.

    Returns
    -------
    One effect per entry of `identifiers`, aligned positionally.
    """
    if st_dev == 0:
        # Guard for the degenerate case, consistent with the other
        # fill_effects variants in this file.
        return np.zeros(len(identifiers))
    no_dup_ids = remove_duplicates(identifiers)
    # BUG FIX: the size argument to np.random.normal was `no_dup_ids`
    # itself (the identifier list), which numpy interprets as an array
    # shape — it must be the number of unique ids.
    id_effect_dict = dict(
        zip(no_dup_ids, np.random.normal(0, st_dev, len(no_dup_ids))))
    return [id_effect_dict[id_] for id_ in identifiers]
def reassign(state_df):
    """Randomly permute person-to-district assignments within one state,
    month by month, preserving "stayers": a person who kept the same
    district in the original data keeps their simulated district too.
    Unlike the other reassign variant in this file, this one writes the
    simulation into a new 'sim_person' column (cast to int at the end)
    instead of overwriting 'person'.  Mutates and returns `state_df`.

    Parameters: state_df needs columns 'distcode', 'person', 'state',
    'month_id'.
    """
    times = sorted(set(state_df['month_id']))
    last_df = state_df[state_df['month_id'] == np.min(times)]
    assert times[0] == np.min(times)
    last_assignments_from_orig = dict(
        zip(last_df['distcode'], last_df['person']))
    # Simulated assignment starts identical to the original one.
    last_assignments_from_sim = dict(
        zip(last_df['distcode'], last_df['person']))
    last_districts = remove_duplicates(last_df['distcode'])
    # Text presumably logs/records the simulated assignments — defined
    # elsewhere in this file.
    text = Text(state_df['state'].values[0], last_assignments_from_orig)
    # Initialize sim_person to NaN so the isfinite check below catches any
    # district that failed to receive an assignment.
    state_df['sim_person'] = np.nan
    initial_idx = state_df['month_id'] == times[0]
    state_df.loc[initial_idx, 'sim_person'] = state_df.loc[initial_idx,
                                                           'person']
    for t in times[1:]:
        indices = pd.Series(state_df['month_id'] == t)
        current_df = state_df[indices]
        current_districts = remove_duplicates(current_df['distcode'])
        current_people = remove_duplicates(current_df['person'])
        # Sanity checks: one row per district and per person in each month.
        assert set(current_districts) == set(current_df['distcode'])
        assert len(current_districts) == len(current_df)
        assert len(current_people) == len(current_df)
        current_assignments_from_orig = dict(
            zip(current_df['distcode'], current_df['person']))
        """ Find people who are in the state in the current period and last period, AND are assigned to the same district in both period. Find the districts they go with. """
        districts_with_continuing_people = \
            [dist for dist in current_districts
             if dist in last_districts
             and last_assignments_from_orig[dist]
             == current_assignments_from_orig[dist]]
        other_districts = [
            dist for dist in current_districts
            if dist not in districts_with_continuing_people
        ]
        # Stayers keep whoever the simulation put in their district last
        # month; everyone else is shuffled over the remaining districts.
        continuing_people = [
            last_assignments_from_sim[d]
            for d in districts_with_continuing_people
        ]
        other_people = [
            p for p in current_people if p not in continuing_people
        ]
        np.random.shuffle(other_people)
        last_assignments_from_sim = dict(
            zip(districts_with_continuing_people + other_districts,
                continuing_people + other_people))
        text.append('simulated assignments', last_assignments_from_sim)
        # Update data with new changes.  The full-column map is aligned back
        # onto only the rows selected by `indices`.
        state_df.loc[indices, 'sim_person'] = state_df['distcode'].map(
            last_assignments_from_sim)
        # Every district in this month must have received an assignment.
        assert np.all(np.isfinite(state_df.loc[indices, 'sim_person']))
        last_assignments_from_orig = current_assignments_from_orig.copy()
        last_districts = current_districts.copy()
    # All rows are now filled, so the float (NaN-capable) column can be int.
    state_df['sim_person'] = state_df['sim_person'].astype(int)
    return state_df