Python remove_duplicatesの例、va_functions.remove_duplicates Pythonの例

コード例 #1

0

ファイルを表示

ファイル: simulate_with_assignments.py プロジェクト: esantorella/tva

def simulate(params, assignments, seed_increment):
    """Simulate an outcome for every row of ``assignments``.

    The outcome is ``mu + delta + theta + error`` where ``mu``, ``delta`` and
    ``theta`` are random effects obtained from ``fill_effects`` for each
    person, district and (district, person) pair respectively, and ``error``
    is a district-by-month shock that follows an AR(1) process across the
    distinct ``month_id`` values.

    Args:
        params: Mapping with the keys 'var mu', 'var theta', 'var delta'
            (variances of the three effects) and 'ar1 param' (the AR(1)
            coefficient applied to the previous month's error).
        assignments: DataFrame with 'distcode', 'person' and 'month_id'
            columns. The 'delta', 'mu' and 'theta' columns are written onto
            this object in place before it is merged.
        seed_increment: Value passed to ``np.random.seed`` so that a run is
            reproducible.

    Returns:
        The left merge of ``assignments`` with the simulated error panel,
        carrying the extra 'error' and 'outcome' columns.
    """
    # unpack parameters
    var_mu = params['var mu']
    var_theta = params['var theta']
    var_delta = params['var delta']
    rho = params['ar1 param']

    np.random.seed(seed_increment)

    # The error variance is what remains of 1 after removing the theta and
    # mu variances.
    var_epsilon = 1 - var_theta - var_mu
    std_epsilon = var_epsilon**.5

    # NOTE: the order of these three calls determines which random draws each
    # effect receives for a given seed; keep it stable.
    assignments['delta'] = fill_effects(assignments['distcode'].values,
                                        var_delta**.5)
    assignments['mu'] = fill_effects(assignments['person'].values, var_mu**.5)
    postings = list(zip(assignments['distcode'].values,
                        assignments['person'].values))
    assignments['theta'] = fill_effects(postings, var_theta**.5)

    # Create a panel in which every district is present in every month and
    # use it to create serial correlation.
    districts = remove_duplicates(assignments['distcode'].values)
    times = remove_duplicates(assignments['month_id'].values)
    T = len(times)
    D = len(districts)

    # Introduce serially correlated errors: one row per month, one column per
    # district. Months are processed in the order remove_duplicates returns
    # them -- presumably order of first appearance; TODO confirm it is
    # chronological for the callers' data.
    all_errors = np.empty((T, D))
    current_error = np.random.normal(0, std_epsilon, D)
    all_errors[0, :] = current_error

    for t in range(1, T):
        current_error = rho * current_error \
            + np.random.normal(0, std_epsilon, D)
        all_errors[t, :] = current_error

    # flatten() walks all_errors row by row, so the district index varies
    # fastest; tile/repeat below reproduce exactly that layout.
    balanced_panel = pd.DataFrame({'distcode': np.tile(districts, T),
                                   'month_id': np.array(times).repeat(D),
                                   'error': all_errors.flatten()})

    # No ``on`` is given, so pandas joins on the columns the two frames share.
    assignments = pd.merge(assignments, balanced_panel, how='left')

    assignments['outcome'] = assignments['mu'] + assignments['delta'] \
                         + assignments['theta'] + assignments['error']
    return assignments

コード例 #2

0

ファイルを表示

ファイル: simulate_with_assignments.py プロジェクト: protogeezer/tva

def simulate(params, assignments, seed_increment):
    """Generate simulated outcomes for a set of person/district assignments.

    Every row receives ``outcome = mu + delta + theta + error``. The first
    three terms come from ``fill_effects`` keyed on person, district and the
    (district, person) pair; ``error`` is shared by all rows of the same
    district and month and evolves as an AR(1) process over the distinct
    ``month_id`` values.

    Args:
        params: Mapping providing 'var mu', 'var theta', 'var delta'
            (variances of the effects) and 'ar1 param' (AR(1) coefficient).
        assignments: DataFrame containing 'distcode', 'person' and
            'month_id'. It is modified in place: 'delta', 'mu' and 'theta'
            columns are added before the merge.
        seed_increment: Seed given to ``np.random.seed``.

    Returns:
        ``assignments`` left-merged with the simulated district-by-month
        errors, including the new 'error' and 'outcome' columns.
    """
    # unpack parameters
    var_mu = params['var mu']
    var_theta = params['var theta']
    var_delta = params['var delta']
    rho = params['ar1 param']

    np.random.seed(seed_increment)

    # Error variance is the remainder of 1 once theta and mu are removed.
    var_epsilon = 1 - var_theta - var_mu
    std_epsilon = var_epsilon**.5

    # The sequence of fill_effects calls fixes which random numbers go to
    # which effect for a given seed, so it must not be reordered.
    assignments['delta'] = fill_effects(assignments['distcode'].values,
                                        var_delta**.5)

    assignments['mu'] = fill_effects(assignments['person'].values, var_mu**.5)
    postings = list(
        zip(assignments['distcode'].values, assignments['person'].values))
    assignments['theta'] = fill_effects(postings, var_theta**.5)

    ## Create panel in which districts are always there
    # And use it to create serial correlation
    districts = remove_duplicates(assignments['distcode'].values)

    times = remove_duplicates(assignments['month_id'].values)
    T = len(times)
    D = len(districts)

    # Introduce serially correlated errors (rows: months, columns: districts).
    # The month order is whatever remove_duplicates yields -- presumably
    # first-appearance order; TODO confirm that it is chronological.
    all_errors = np.empty((T, D))
    current_error = np.random.normal(0, std_epsilon, D)
    all_errors[0, :] = current_error

    for t in range(1, T):
        current_error = rho * current_error + np.random.normal(
            0, std_epsilon, D)
        all_errors[t, :] = current_error

    # flatten() is row-major, i.e. district varies fastest within a month,
    # which is the same layout tile/repeat produce for the key columns.
    balanced_panel = pd.DataFrame({
        'distcode': np.tile(districts, T),
        'month_id': np.array(times).repeat(D),
        'error': all_errors.flatten()
    })

    # Without ``on`` pandas merges on the columns common to both frames.
    assignments = pd.merge(assignments, balanced_panel, how='left')

    assignments['outcome'] = assignments['mu'] + assignments['delta'] \
                         + assignments['theta'] + assignments['error']
    return assignments

コード例 #3

0

ファイルを表示

ファイル: simulate_with_assignments.py プロジェクト: esantorella/tva

def fill_effects(identifiers, st_dev):
    """Assign one random effect to each distinct identifier.

    A single normal draw with mean 0 and standard deviation ``st_dev`` is
    made for every identifier returned by ``remove_duplicates`` and then
    looked up for each position of ``identifiers``, so repeated identifiers
    share the same value.

    When ``st_dev`` is 0 no random numbers are drawn and an array of zeros
    with one entry per identifier is returned; otherwise the result is a
    list.
    """
    if st_dev != 0:
        unique_ids = remove_duplicates(identifiers)
        draws = np.random.normal(0, st_dev, len(unique_ids))
        effect_of = dict(zip(unique_ids, draws))
        return [effect_of[key] for key in identifiers]

    # Degenerate distribution: every effect is exactly zero.
    return np.zeros(len(identifiers))

コード例 #4

0

ファイルを表示

ファイル: simulate_with_assignments.py プロジェクト: protogeezer/tva

def fill_effects(identifiers, st_dev):
    """Map each entry of ``identifiers`` to a random effect for its id.

    The distinct ids are taken from ``remove_duplicates``; each gets one
    draw from a normal distribution with mean 0 and standard deviation
    ``st_dev``. The returned list has one value per entry of
    ``identifiers``, with equal ids receiving equal values.

    A ``st_dev`` of 0 short-circuits: nothing is drawn and a zero array of
    the same length is returned instead of a list.
    """
    n_rows = len(identifiers)
    if st_dev == 0:
        return np.zeros(n_rows)

    distinct_ids = remove_duplicates(identifiers)
    sampled = np.random.normal(0, st_dev, len(distinct_ids))
    effect_by_id = {}
    for key, value in zip(distinct_ids, sampled):
        effect_by_id[key] = value
    return [effect_by_id[key] for key in identifiers]

コード例 #5

0

ファイルを表示

def reassign(state_df, time_var):
    """Rewrite the 'person' column with simulated assignments, period by period.

    The distinct values of ``state_df[time_var]`` are visited in sorted order.
    The earliest period is left untouched and serves as the starting point.
    For each later period, people whose district in the original data is the
    same as in the previous period are treated as "continuing"; all remaining
    people of that period are shuffled with ``np.random.shuffle`` and paired
    with the remaining districts.

    ``state_df`` must contain 'distcode', 'person' and 'state' columns and,
    as the assertions below require, exactly one row per district in every
    period after the first.

    The frame is modified in place ('person' is overwritten for every period
    after the first) and the same object is returned.
    """
    times = sorted(set(state_df[time_var]))
    # Rows belonging to the earliest period.
    last_df = state_df[state_df[time_var] == np.min(times)]
    last_assignments_from_orig = dict(
        zip(last_df['distcode'], last_df['person']))
    # NOTE(review): this dict is never read again in this function.
    last_assignments_from_sim = dict(
        zip(last_df['distcode'], last_df['person']))
    last_districts = remove_duplicates(last_df['distcode'])
    # person -> district as of the most recent simulated period.
    last_district_from_sim = dict(zip(last_df['person'], last_df['distcode']))

    # Text is defined elsewhere in the project; presumably it records the
    # assignment history of the state -- TODO confirm.
    text = Text(state_df['state'].values[0], last_assignments_from_orig)

    for t in times[1:]:
        # Boolean mask selecting the rows of the current period.
        indices = pd.Series(state_df[time_var] == t)
        current_df = state_df[indices]
        current_districts = remove_duplicates(current_df['distcode'])
        # Every district must appear exactly once in the current period.
        assert set(current_districts) == set(current_df['distcode'])
        assert len(current_districts) == len(current_df)
        # district -> person according to the data as it stands on entry to
        # this iteration (these rows have not been overwritten yet).
        current_assignments_from_orig = dict(
            zip(current_df['distcode'], current_df['person']))
        """
        Find people who are in the state in the current period and last period,
        AND are assigned to the same district in both period. 
        Find the districts they go with.
        """
        districts_with_continuing_people = \
                [dist for dist in current_districts
                        if dist in last_districts and
                       last_assignments_from_orig[dist] == current_assignments_from_orig[dist]]
        assert len(districts_with_continuing_people) == len(
            set(districts_with_continuing_people))
        assert set(districts_with_continuing_people).issubset(
            set(current_df['distcode']))
        people_continuing_in_district = [
            last_assignments_from_orig[dist]
            for dist in districts_with_continuing_people
        ]
        assert len(people_continuing_in_district) == len(
            set(people_continuing_in_district))
        assert set(people_continuing_in_district).issubset(
            set(current_df['person']))

        # Districts the continuing people held in the previous *simulated*
        # period, keeping only those that still exist in the current period.
        # NOTE(review): because of that filter this list can be shorter than
        # people_continuing_in_district, in which case the positional pairing
        # in the zip further down no longer lines each continuing person up
        # with their own previous district -- confirm this is intended.
        continuation_people_districts = [
            last_district_from_sim[p] for p in people_continuing_in_district
            if last_district_from_sim[p] in current_districts
        ]
        assert len(continuation_people_districts) == len(
            set(continuation_people_districts))
        assert set(continuation_people_districts).issubset(
            set(current_df['distcode']))

        # Everybody in the current period who is not continuing.
        other_people = [
            p for p in remove_duplicates(current_df['person'])
            if p not in people_continuing_in_district
        ]
        assert set(other_people) | set(people_continuing_in_district) == set(
            current_df['person'])
        """ 
        Create new assignments:
            - People who continue in the same district do so
            - Everyone else is randomly assigned to one of the other districts
        """
        # In-place shuffle; consumes numpy's global random state.
        np.random.shuffle(other_people)

        person_assignments = people_continuing_in_district + other_people
        assert set(person_assignments) == set(current_df['person'])
        other_districts = [
            d for d in current_districts
            if d not in continuation_people_districts
        ]
        assert len(other_districts) == len(set(other_districts))
        assert set(other_districts).issubset(set(current_districts))
        district_assignments = continuation_people_districts + other_districts
        assert set(district_assignments) == set(current_districts)
        assert len(district_assignments) == len(person_assignments)

        # district -> simulated person, paired by list position.
        assignments = dict(zip(district_assignments, person_assignments))
        text.append('simulated assignments', assignments)
        # Update data with new changes: the mapped series covers the whole
        # frame, but only the rows selected by ``indices`` are assigned.
        state_df.loc[indices, 'person'] = state_df['distcode'].map(assignments)
        # Carry this period's state forward for the next iteration.
        last_assignments_from_orig = current_assignments_from_orig.copy()
        last_district_from_sim = dict(
            zip(person_assignments, district_assignments))
        last_districts = current_districts.copy()

    return state_df

コード例 #6

0

ファイルを表示

ファイル: simulate_with_assignments_old.py プロジェクト: protogeezer/tva

def convert_vector_to_index_dict(vector):
    """Return a dict mapping each value to its position among the values
    that ``remove_duplicates(vector)`` returns."""
    index_by_value = {}
    for position, item in enumerate(remove_duplicates(vector)):
        index_by_value[item] = position
    return index_by_value

コード例 #7

0

ファイルを表示

ファイル: simulate_with_assignments_old.py プロジェクト: protogeezer/tva

def fill_effects(identifiers, st_dev):
    """Assign one random effect to each distinct identifier.

    One value is drawn from a normal distribution with mean 0 and standard
    deviation ``st_dev`` for every identifier returned by
    ``remove_duplicates``; the returned list holds, for each entry of
    ``identifiers``, the value drawn for its identifier.
    """
    no_dup_ids = remove_duplicates(identifiers)
    # ``size`` must be the number of draws; passing the id list itself would
    # be interpreted as an output shape (or fail for non-integer ids).
    id_effect_dict = dict(zip(no_dup_ids
                        , np.random.normal(0, st_dev, len(no_dup_ids))))
    return [id_effect_dict[id_] for id_ in identifiers]

コード例 #8

0

ファイルを表示

def reassign(state_df):
    """Add a 'sim_person' column holding simulated person-to-district assignments.

    The distinct 'month_id' values are visited in sorted order. In the first
    month 'sim_person' is copied from 'person'. In each later month, a
    district whose 'person' is the same as in the previous month keeps the
    person it had in the previous *simulated* assignment; the remaining
    people of the month are shuffled with ``np.random.shuffle`` and paired
    with the remaining districts.

    ``state_df`` must contain 'month_id', 'distcode', 'person' and 'state'
    columns and, as asserted below, exactly one row per district and per
    person in every month after the first.

    The frame is modified in place and returned; 'sim_person' is cast to int
    at the end.
    """
    times = sorted(set(state_df['month_id']))
    # Rows belonging to the earliest month.
    last_df = state_df[state_df['month_id'] == np.min(times)]
    assert times[0] == np.min(times)
    # district -> person in the previous month, from the original data ...
    last_assignments_from_orig = dict(
        zip(last_df['distcode'], last_df['person']))
    # ... and from the simulation (identical for the first month).
    last_assignments_from_sim = dict(
        zip(last_df['distcode'], last_df['person']))
    last_districts = remove_duplicates(last_df['distcode'])

    # Text is defined elsewhere in the project; presumably it records the
    # assignment history of the state -- TODO confirm.
    text = Text(state_df['state'].values[0], last_assignments_from_orig)
    # Start with all-missing values and seed the first month from the data.
    state_df['sim_person'] = np.nan
    initial_idx = state_df['month_id'] == times[0]
    state_df.loc[initial_idx, 'sim_person'] = state_df.loc[initial_idx,
                                                           'person']

    for t in times[1:]:
        # Boolean mask selecting the rows of the current month.
        indices = pd.Series(state_df['month_id'] == t)
        current_df = state_df[indices]
        current_districts = remove_duplicates(current_df['distcode'])
        current_people = remove_duplicates(current_df['person'])
        # One row per district and per person within the month.
        assert set(current_districts) == set(current_df['distcode'])
        assert len(current_districts) == len(current_df)
        assert len(current_people) == len(current_df)
        current_assignments_from_orig = dict(
            zip(current_df['distcode'], current_df['person']))
        """
        Find people who are in the state in the current period and last period,
        AND are assigned to the same district in both period. 
        Find the districts they go with.
        """
        districts_with_continuing_people = \
            [dist for dist in current_districts
                if dist in last_districts and
                last_assignments_from_orig[dist] == current_assignments_from_orig[dist]]
        other_districts = [
            dist for dist in current_districts
            if dist not in districts_with_continuing_people
        ]
        # The person each continuing district had in the previous simulated
        # assignment (not necessarily the person in the original data).
        continuing_people = [
            last_assignments_from_sim[d]
            for d in districts_with_continuing_people
        ]
        other_people = [
            p for p in current_people if p not in continuing_people
        ]
        # In-place shuffle; consumes numpy's global random state.
        np.random.shuffle(other_people)
        # Pair districts and people by position. NOTE(review): zip stops at
        # the shorter sequence, so a length mismatch would leave districts
        # unmapped; the isfinite assertion below would then fail on the
        # resulting NaN values.
        last_assignments_from_sim = dict(
            zip(districts_with_continuing_people + other_districts,
                continuing_people + other_people))

        text.append('simulated assignments', last_assignments_from_sim)
        # Update data with new changes: the mapped series covers the whole
        # frame, but only the rows selected by ``indices`` are assigned.
        state_df.loc[indices, 'sim_person'] = state_df['distcode'].map(
            last_assignments_from_sim)
        # Every row of this month must have received a simulated person.
        assert np.all(np.isfinite(state_df.loc[indices, 'sim_person']))
        # Carry this month's state forward for the next iteration.
        last_assignments_from_orig = current_assignments_from_orig.copy()
        last_districts = current_districts.copy()

    # The column was created as float (NaN placeholder); convert to int now
    # that it has been filled.
    state_df['sim_person'] = state_df['sim_person'].astype(int)
    return state_df