Example #1
    def compute_measures_moments(data, jids):
        '''
        Arguments:
            data (Pandas DataFrame): data to use
            jids (list): sorted list of firm ids in data (since data could be a subset of full dataset, this is not necessarily all firms in self)

        Returns:
            moments (NumPy Array): NumPy array of firm moments
        '''
        # NOTE: `measures`, `to_list`, `np`, and `aggregate_transform` come from the
        # enclosing scope/module of this snippet and are not defined here
        n_firms = len(jids)  # Can't use data.n_firms() since data could be a subset of self
        n_measures = len(to_list(measures))
        moments = np.zeros([n_firms, n_measures])

        data.sort_values('j', inplace=True)  # Required for aggregate_transform

        for j, measure in enumerate(to_list(measures)):
            if measure == 'mean':
                # Weighted mean income by firm: weighted sum of y divided by the sum of weights
                data['one'] = 1
                weighted_sum = aggregate_transform(data, 'j', 'y', 'sum', weights='row_weights', merge=False)
                weight_sum = aggregate_transform(data, 'j', 'one', 'sum', weights='row_weights', merge=False)
                moments[:, j] = weighted_sum / weight_sum
            elif measure == 'var':
                # Weighted variance of income by firm
                moments[:, j] = aggregate_transform(data, 'j', 'y', 'var', weights='row_weights', merge=False)
            elif measure == 'max':
                moments[:, j] = data.groupby('j')['y'].max().to_numpy()
            elif measure == 'min':
                moments[:, j] = data.groupby('j')['y'].min().to_numpy()

        return moments
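
For intuition, the weighted 'mean' branch above can be reproduced with plain pandas. A minimal standalone sketch with hypothetical toy data (column names mirror the snippet; aggregate_transform itself comes from the surrounding module and is not needed here):

import pandas as pd

# Toy data mirroring the columns the snippet assumes
data = pd.DataFrame({
    'j': [0, 0, 1, 1],            # firm ids
    'y': [1.0, 2.0, 3.0, 5.0],    # income
    'row_weights': [1, 3, 1, 1],  # observation weights
})
# Weighted group mean: sum(w * y) / sum(w), per firm
grouped = data.assign(wy=data['y'] * data['row_weights']).groupby('j')[['wy', 'row_weights']].sum()
weighted_mean = grouped['wy'] / grouped['row_weights']
print(weighted_mean.to_numpy())  # [1.75 4.  ]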
    def get_cs(self):
        '''
        Return (collapsed) event study data reformatted into cross section data.

        Returns:
            data_cs (Pandas DataFrame): cross section data
        '''
        # Generate m column (the function checks if it already exists)
        self.gen_m()

        sdata = pd.DataFrame(self[self['m'] == 0])
        jdata = pd.DataFrame(self[self['m'] == 1])

        # Columns used for constructing cross section
        cs_cols = self._included_cols(flat=True)

        # Dictionary to swap names for cs=0 rows (these rows contain period-2 data for movers, so columns must be swapped so that all relevant information ends up in the same column; e.g. y2 must move into y1, otherwise the bottom rows are just duplicates)
        rename_dict = {}
        for col in self._included_cols():
            subcols = bpd.to_list(self.reference_dict[col])
            n_subcols = len(subcols)
            # If there is an even number of subcols, the column is formatted as 'x1', 'x2', etc., so the subcols must be swapped to 'x2', 'x1', etc.
            if n_subcols % 2 == 0:
                halfway = n_subcols // 2
                for i in range(halfway):
                    rename_dict[subcols[i]] = subcols[halfway + i]
                    rename_dict[subcols[halfway + i]] = subcols[i]

        # Combine the two datasets
        data_cs = pd.concat([
            sdata[cs_cols].assign(cs=1),
            jdata[cs_cols].assign(cs=1),
            jdata[cs_cols].rename(rename_dict, axis=1).assign(cs=0)
        ], ignore_index=True)

        # Sort columns
        sorted_cols = sorted(data_cs.columns, key=bpd.col_order)
        data_cs = data_cs[sorted_cols]

        self.logger.info(
            'mover and stayer event study datasets combined into cross section'
        )

        return data_cs
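
The swap dictionary built above pairs period-1 and period-2 subcolumns whenever a column has an even number of subcolumns. A standalone illustration, assuming the collapsed event study t subcolumns ['t11', 't12', 't21', 't22'] mentioned in comments elsewhere in this file:

subcols = ['t11', 't12', 't21', 't22']
halfway = len(subcols) // 2  # 2
rename_dict = {}
for i in range(halfway):
    rename_dict[subcols[i]] = subcols[halfway + i]
    rename_dict[subcols[halfway + i]] = subcols[i]
print(rename_dict)  # {'t11': 't21', 't21': 't11', 't12': 't22', 't22': 't12'}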
    def unstack_es(self, return_df=False):
        '''
        Unstack (collapsed) event study data into (collapsed) long form by stacking (and renaming) period-2 data below period-1 data for movers, then dropping the period-2 columns. Duplicates created by the unstacking are dropped.

        Arguments:
            return_df (bool): if True, return a Pandas dataframe instead of a BipartiteLong(Collapsed) dataframe

        Returns:
            long_frame (BipartiteLong(Collapsed) or Pandas DataFrame): BipartiteLong(Collapsed) or Pandas dataframe generated from (collapsed) event study data
        '''
        # Generate m column (the function checks if it already exists)
        self.gen_m()

        # Dictionary to swap names (period-2 data must end up in the period-1 columns, so mover rows are re-appended with swapped column names)
        rename_dict_1 = {}
        # Dictionary to reformat names into (collapsed) long form
        rename_dict_2 = {}
        # For casting column types
        astype_dict = {}
        # Columns to drop
        drops = []
        for col in self._included_cols():
            subcols = bpd.to_list(self.reference_dict[col])
            n_subcols = len(subcols)
            # If there is an even number of subcols, the column is formatted as 'x1', 'x2', etc., so the subcols must be swapped to 'x2', 'x1', etc.
            if n_subcols % 2 == 0:
                halfway = n_subcols // 2
                for i in range(halfway):
                    rename_dict_1[subcols[i]] = subcols[halfway + i]
                    rename_dict_1[subcols[halfway + i]] = subcols[i]
                    subcol_number = subcols[i].strip(col)  # E.g. 'j1' gives '1'
                    rename_dict_2[subcols[i]] = col + subcol_number[1:]  # Drop the first digit, e.g. 't12' becomes 't2' (no indexing issue even if subcol_number has only one digit)
                    if self.col_dtype_dict[col] == 'int':
                        astype_dict[rename_dict_2[subcols[i]]] = int

                    drops.append(subcols[halfway + i])

            else:
                # Check correct type for other columns
                if self.col_dtype_dict[col] == 'int':
                    astype_dict[col] = int

        # Sort by i, t if t included; otherwise sort by i
        sort_order = ['i']
        if self._col_included('t'):
            sort_order.append(bpd.to_list(self.reference_dict['t'])[0][:-1])  # Remove last number, e.g. 't11' becomes 't1'

        # Stack period-2 data for movers (each worker's final observation appears only in the period-2 columns, never in the period-1 columns)
        stacked_df = pd.DataFrame(self[self['m'] == 1]).rename(rename_dict_1, axis=1)

        try:
            data_long = pd.concat([pd.DataFrame(self), stacked_df], ignore_index=True) \
                .drop(drops, axis=1) \
                .rename(rename_dict_2, axis=1) \
                .astype(astype_dict)
        except ValueError:  # If there are NaN values, fall back to the nullable Int64 dtype
            for col in astype_dict.keys():
                astype_dict[col] = 'Int64'
            data_long = pd.concat([pd.DataFrame(self), stacked_df], ignore_index=True) \
                .drop(drops, axis=1) \
                .rename(rename_dict_2, axis=1) \
                .astype(astype_dict)

        # Sort columns and rows
        sorted_cols = sorted(data_long.columns, key=bpd.col_order)
        data_long = data_long[sorted_cols].sort_values(sort_order).reset_index(
            drop=True)

        if return_df:
            return data_long

        long_frame = self._constructor_long(data_long)
        long_frame._set_attributes(self, no_dict=True)

        return long_frame
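
The unstacking mechanics can be pictured with plain pandas. A minimal sketch with hypothetical mover rows (the real method builds rename_dict_1 and rename_dict_2 generically from reference_dict; the explicit drop_duplicates here stands in for the deduplication the docstring describes):

import pandas as pd

es = pd.DataFrame({
    'i': [0, 0], 'j1': [1, 2], 'j2': [2, 3],
    'y1': [1.0, 2.0], 'y2': [2.0, 3.0], 'm': [1, 1],
})
swap = {'j1': 'j2', 'j2': 'j1', 'y1': 'y2', 'y2': 'y1'}  # plays the role of rename_dict_1
to_long = {'j1': 'j', 'y1': 'y'}                         # plays the role of rename_dict_2
stacked = es.rename(swap, axis=1)  # period-2 data now sits in the period-1 columns
data_long = (pd.concat([es, stacked], ignore_index=True)
               .drop(['j2', 'y2'], axis=1)
               .rename(to_long, axis=1)
               .drop_duplicates(subset=['i', 'j', 'y'])
               .sort_values('i'))
# data_long now holds firms 1, 2, 3 with incomes 1.0, 2.0, 3.0 for worker 0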
    def get_long(self, return_df=False):
        '''
        Return (collapsed) event study data reformatted into (collapsed) long form.

        Arguments:
            return_df (bool): if True, return a Pandas dataframe instead of a BipartiteLong(Collapsed) dataframe

        Returns:
            long_frame (BipartiteLong(Collapsed) or Pandas DataFrame): BipartiteLong(Collapsed) or Pandas dataframe generated from (collapsed) event study data
        '''
        # Generate m column (the function checks if it already exists)
        self.gen_m()

        # Dictionary to swap names (needed for each worker's last row: the period-2 observation has no subsequent period-1 column to appear in, so the last row must be appended with swapped column names)
        rename_dict_1 = {}
        # Dictionary to reformat names into (collapsed) long form
        rename_dict_2 = {}
        # For casting column types
        astype_dict = {}
        # Columns to drop
        drops = []
        for col in self._included_cols():
            subcols = bpd.to_list(self.reference_dict[col])
            n_subcols = len(subcols)
            # If there is an even number of subcols, the column is formatted as 'x1', 'x2', etc., so the subcols must be swapped to 'x2', 'x1', etc.
            if n_subcols % 2 == 0:
                halfway = n_subcols // 2
                for i in range(halfway):
                    rename_dict_1[subcols[i]] = subcols[halfway + i]
                    rename_dict_1[subcols[halfway + i]] = subcols[i]
                    subcol_number = subcols[i].strip(col)  # E.g. 'j1' gives '1'
                    rename_dict_2[subcols[i]] = col + subcol_number[1:]  # Drop the first digit, e.g. 't12' becomes 't2' (no indexing issue even if subcol_number has only one digit)
                    if self.col_dtype_dict[col] == 'int':
                        astype_dict[rename_dict_2[subcols[i]]] = int

                    drops.append(subcols[halfway + i])

            else:
                # Check correct type for other columns
                if self.col_dtype_dict[col] == 'int':
                    astype_dict[col] = int

        # Sort by i, t if t included; otherwise sort by i
        sort_order_1 = ['i']
        sort_order_2 = ['i']
        if self._col_included('t'):
            sort_order_1.append(bpd.to_list(self.reference_dict['t'])[0])  # Pre-reformatting, e.g. 't11'
            sort_order_2.append(bpd.to_list(self.reference_dict['t'])[0][:-1])  # Remove last number, e.g. 't11' becomes 't1'

        # For movers, append the last row (each worker's final observation appears only in the period-2 columns, never in the period-1 columns)
        last_obs_df = pd.DataFrame(self[self['m'] == 1]) \
            .sort_values(sort_order_1) \
            .drop_duplicates(subset='i', keep='last') \
            .rename(rename_dict_1, axis=1)  # Sort by i, t to ensure the last observation is actually last

        try:
            data_long = pd.concat([pd.DataFrame(self), last_obs_df], ignore_index=True) \
                .drop(drops, axis=1) \
                .rename(rename_dict_2, axis=1) \
                .astype(astype_dict)
        except ValueError:  # If there are NaN values, fall back to the nullable Int64 dtype
            for col in astype_dict.keys():
                astype_dict[col] = 'Int64'
            data_long = pd.concat([pd.DataFrame(self), last_obs_df], ignore_index=True) \
                .drop(drops, axis=1) \
                .rename(rename_dict_2, axis=1) \
                .astype(astype_dict)
        # data_long = pd.DataFrame(self).groupby('i').apply(lambda a: a.append(a.iloc[-1].rename(rename_dict_1, axis=1)) if a.iloc[0]['m'] == 1 else a) \
        #     .reset_index(drop=True) \
        #     .drop(drops, axis=1) \
        #     .rename(rename_dict_2, axis=1) \
        #     .astype(astype_dict)

        # Sort columns and rows
        sorted_cols = sorted(data_long.columns, key=bpd.col_order)
        data_long = data_long[sorted_cols].sort_values(
            sort_order_2).reset_index(drop=True)

        if return_df:
            return data_long

        long_frame = self._constructor_long(data_long)
        long_frame._set_attributes(self, no_dict=True)

        return long_frame
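
The key difference from unstack_es is that only each mover's last event study row is appended after swapping, not every mover row. A minimal sketch of that selection with hypothetical columns:

import pandas as pd

es = pd.DataFrame({
    'i': [0, 0], 't1': [2000, 2001], 't2': [2001, 2002],
    'j1': [1, 2], 'j2': [2, 3], 'm': [1, 1],
})
swap = {'j1': 'j2', 'j2': 'j1', 't1': 't2', 't2': 't1'}  # plays the role of rename_dict_1
last_obs = (es.sort_values(['i', 't1'])
              .drop_duplicates(subset='i', keep='last')
              .rename(swap, axis=1))
# last_obs now holds t1=2002, j1=3 in the period-1 columns, ready to concat below the original rows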
Example #5
    def get_es(self):
        '''
        Return (collapsed) long form data reformatted into (collapsed) event study data.

        Returns:
            es_frame (BipartiteEventStudy(Collapsed)): BipartiteEventStudy(Collapsed) object generated from (collapsed) long data
        '''
        # Generate m column (the function checks if it already exists)
        self.gen_m()

        # Split workers by movers and stayers
        stayers = pd.DataFrame(self[self['m'] == 0])
        movers = pd.DataFrame(self[self['m'] == 1])
        self.logger.info('workers split by movers and stayers')

        # Add lagged values
        all_cols = self._included_cols()
        movers = movers.sort_values(['i', bpd.to_list(self.reference_dict['t'])[0]])  # Sort by i, t
        keep_cols = ['i']  # Columns to keep
        for col in all_cols:
            for subcol in bpd.to_list(self.reference_dict[col]):
                subcol_number = subcol.strip(col)  # E.g. j1 will give 1
                if subcol != 'm':  # Don't want lagged m
                    # Movers
                    plus_1 = col + '1' + subcol_number  # Useful for t1 and t2: t1 should go to t11 and t21; t2 should go to t12 and t22
                    plus_2 = col + '2' + subcol_number
                    movers[plus_1] = movers[subcol].shift(periods=1)  # Lagged value
                    movers.rename({subcol: plus_2}, axis=1, inplace=True)
                    # Stayers (no lags)
                    stayers[plus_1] = stayers[subcol]
                    stayers.rename({subcol: plus_2}, axis=1, inplace=True)
                    if subcol != 'i':  # Columns to keep
                        keep_cols += [plus_1, plus_2]
                else:
                    keep_cols.append('m')

        movers = movers[movers['i1'] == movers['i2']]  # Ensure lagged values are for the same worker

        # Correct datatypes (shifting introduces NaNs, which converts int columns to float; convert those columns back to int)
        for col in all_cols:
            if (self.col_dtype_dict[col] == 'int') and (col != 'm'):
                for subcol in bpd.to_list(self.reference_dict[col]):
                    subcol_number = subcol.strip(col)  # E.g. 'j1' gives '1'
                    movers[col + '1' + subcol_number] = movers[col + '1' + subcol_number].astype(int)

        # Correct i
        movers.drop('i1', axis=1, inplace=True)
        movers.rename({'i2': 'i'}, axis=1, inplace=True)
        stayers.drop('i2', axis=1, inplace=True)
        stayers.rename({'i1': 'i'}, axis=1, inplace=True)

        # Keep only relevant columns
        stayers = stayers[keep_cols]
        movers = movers[keep_cols]
        self.logger.info('columns updated')

        # Combine stayers and movers
        data_es = pd.concat([stayers, movers]).reset_index(drop=True)

        # Sort columns
        sorted_cols = sorted(data_es.columns, key=bpd.col_order)
        data_es = data_es[sorted_cols]

        self.logger.info('data reformatted as event study')

        es_frame = self._constructor_es(data_es)
        es_frame._set_attributes(self, no_dict=True)

        return es_frame
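
The mover event study rows are built by pairing each observation with its lag. A minimal standalone sketch with hypothetical columns (the real method loops over reference_dict to handle every included column):

import pandas as pd

movers = pd.DataFrame({
    'i': [0, 0, 0], 'j': [1, 2, 3],
    'y': [1.0, 2.0, 3.0], 't': [2000, 2001, 2002],
}).sort_values(['i', 't'])

movers['i1'] = movers['i'].shift(periods=1)  # lagged worker id
movers['j1'] = movers['j'].shift(periods=1)  # lagged firm id
movers['y1'] = movers['y'].shift(periods=1)  # lagged income
movers = movers.rename({'i': 'i2', 'j': 'j2', 'y': 'y2'}, axis=1)
# Drop rows whose lag belongs to another worker (this also drops each worker's first row, whose lag is NaN)
movers = movers[movers['i1'] == movers['i2']]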
Example #6
    def get_collapsed_long(self, copy=True):
        '''
        Collapse long data by job spells (so each spell for a particular worker at a particular firm is one observation).

        Arguments:
            copy (bool): if False, avoid copy

        Returns:
            collapsed_frame (BipartiteLongCollapsed): BipartiteLongCollapsed object generated from long data collapsed by job spells
        '''
        # Generate m column (the function checks if it already exists)
        self.gen_m()

        # Convert to Pandas dataframe
        data = pd.DataFrame(self, copy=copy)
        # Sort data by i and t
        data = data.sort_values(['i', 't'])
        self.logger.info('copied data sorted by i and t')

        # Introduce lagged i and j
        data['i_l1'] = data['i'].shift(periods=1)
        data['j_l1'] = data['j'].shift(periods=1)
        self.logger.info('lagged i and j introduced')

        # Generate spell ids
        # Source: https://stackoverflow.com/questions/59778744/pandas-grouping-and-aggregating-consecutive-rows-with-same-value-in-column
        new_spell = (data['j'] != data['j_l1']) | (data['i'] != data['i_l1'])  # Allow for i != i_l1 to ensure that consecutive workers at the same firm get counted as different spells
        data['spell_id'] = new_spell.cumsum()
        self.logger.info('spell ids generated')

        # Aggregate at the spell level
        spell = data.groupby(['spell_id'])
        # First, aggregate required columns
        data_spell = spell.agg(i=pd.NamedAgg(column='i', aggfunc='first'),
                               j=pd.NamedAgg(column='j', aggfunc='first'),
                               y=pd.NamedAgg(column='y', aggfunc='mean'),
                               t1=pd.NamedAgg(column='t', aggfunc='min'),
                               t2=pd.NamedAgg(column='t', aggfunc='max'),
                               w=pd.NamedAgg(column='i', aggfunc='size'))
        # Next, aggregate optional columns
        all_cols = self._included_cols()
        for col in all_cols:
            if col in self.columns_opt:
                if self.col_dtype_dict[col] == 'int':
                    for subcol in bpd.to_list(self.reference_dict[col]):
                        data_spell[subcol] = spell[subcol].first()
                if self.col_dtype_dict[col] == 'float':
                    for subcol in bpd.to_list(self.reference_dict[col]):
                        data_spell[subcol] = spell[subcol].mean()

        # # Classify movers and stayers
        # if not self._col_included('m'):
        #     spell_count = data_spell.groupby(['i']).transform('count')['j'] # Choice of j arbitrary
        #     data_spell['m'] = (spell_count > 1).astype(int)
        collapsed_data = data_spell.reset_index(drop=True)

        # Sort columns
        sorted_cols = sorted(collapsed_data.columns, key=bpd.col_order)
        collapsed_data = collapsed_data[sorted_cols]

        self.logger.info('data aggregated at the spell level')

        collapsed_frame = bpd.BipartiteLongCollapsed(collapsed_data)
        collapsed_frame._set_attributes(self, no_dict=True)

        return collapsed_frame
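
The spell-id trick is self-contained and worth seeing in isolation. A minimal sketch with hypothetical long data (worker 0 stays at firm 1 for two years, then moves):

import pandas as pd

data = pd.DataFrame({
    'i': [0, 0, 0, 1], 'j': [1, 1, 2, 2],
    'y': [1.0, 2.0, 3.0, 4.0], 't': [2000, 2001, 2002, 2000],
}).sort_values(['i', 't'])

# New spell whenever the firm or the worker changes relative to the previous row
new_spell = (data['j'] != data['j'].shift(1)) | (data['i'] != data['i'].shift(1))
data['spell_id'] = new_spell.cumsum()
collapsed = data.groupby('spell_id').agg(
    i=('i', 'first'), j=('j', 'first'), y=('y', 'mean'),
    t1=('t', 'min'), t2=('t', 'max'), w=('i', 'size'),
)
# Spell 1 collapses to i=0, j=1, y=1.5, t1=2000, t2=2001, w=2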
Example #7
    def get_es_extended(self,
                        periods_pre=3,
                        periods_post=3,
                        stable_pre=[],
                        stable_post=[],
                        include=['g', 'y'],
                        transition_col='j',
                        copy=True):
        '''
        Return a Pandas dataframe of an extended event study, with periods_pre periods before each transition and periods_post periods after it, where transitions are defined by changes in the transition column (job moves by default) and the first post-period is the period of the transition itself. The returned dataframe gives worker id, period of transition, income over all periods, and firm cluster over all periods. If any required column (e.g. g) is missing, the method returns None, so run .cluster() beforehand if no g column exists.

        Arguments:
            periods_pre (int): number of periods before the transition
            periods_post (int): number of periods after the transition
            stable_pre (column name or list of column names): for each column, keep only workers who have constant values in that column before the transition
            stable_post (column name or list of column names): for each column, keep only workers who have constant values in that column after the transition
            include (column name or list of column names): columns to include data for all periods
            transition_col (str): column to use to define a transition
            copy (bool): if False, avoid copy

        Returns:
            es_extended_frame or None (Pandas DataFrame or None): extended event study generated from long data if clustered; None if not clustered
        '''
        # Convert into lists
        include = bpd.to_list(include)
        stable_pre = bpd.to_list(stable_pre)
        stable_post = bpd.to_list(stable_post)

        # Get list of all columns (note that stable_pre and stable_post can have columns that are not in include)
        all_cols = include[:]
        for col in set(stable_pre + stable_post):
            if col not in all_cols:
                all_cols.append(col)

        # Check that columns exist
        for col in all_cols:
            if not self._col_included(col):
                return None

        # Create return frame
        es_extended_frame = pd.DataFrame(self, copy=copy)

        # Count how many periods each worker worked
        es_extended_frame['one'] = 1
        es_extended_frame['worker_total_periods'] = es_extended_frame.groupby('i')['one'].transform(sum)  # Much faster to use .transform(sum) than .transform(len)

        # Keep workers with enough periods (must have at least periods_pre + periods_post periods)
        es_extended_frame = es_extended_frame[
            es_extended_frame['worker_total_periods'] >= periods_pre + periods_post
        ].reset_index(drop=True)

        # Sort by worker-period
        es_extended_frame.sort_values(['i', 't'], inplace=True)

        # For each worker-period, count (total years worked - 1) up to that point (e.g. if a worker started in 2005 with data each year, then 2008 gives 3, 2009 gives 4, etc.)
        es_extended_frame['worker_periods_worked'] = es_extended_frame.groupby('i')['one'].cumsum() - 1
        es_extended_frame.drop('one', axis=1, inplace=True)

        # Find periods where the worker transitioned, which can serve as fulcrums for the event study
        es_extended_frame['moved_firms'] = (
            (es_extended_frame['i'] == es_extended_frame['i'].shift(periods=1))
            & (es_extended_frame[transition_col] != es_extended_frame[transition_col].shift(periods=1))
        ).astype(int)

        # Compute valid moves - periods where the worker transitioned, and they also have periods_pre periods before the move, and periods_post periods after (and including) the move
        es_extended_frame['valid_move'] = \
                                es_extended_frame['moved_firms'] & \
                                (es_extended_frame['worker_periods_worked'] >= periods_pre) & \
                                ((es_extended_frame['worker_total_periods'] - es_extended_frame['worker_periods_worked']) >= periods_post)

        # Drop irrelevant columns
        es_extended_frame.drop(
            ['worker_total_periods', 'worker_periods_worked', 'moved_firms'],
            axis=1,
            inplace=True)

        # Only keep workers who have a valid move
        es_extended_frame = es_extended_frame[es_extended_frame.groupby('i')['valid_move'].transform(max) > 0]

        # Compute lags and leads
        column_order = [[] for _ in range(len(include))]  # For column order (indexed by position in include; include columns come first in all_cols, so the enumerate index below lines up)
        for i, col in enumerate(all_cols):
            # Compute lagged values
            for j in range(1, periods_pre + 1):
                es_extended_frame['{}_l{}'.format(col, j)] = es_extended_frame[col].shift(periods=j)
                if col in include:
                    column_order[i].insert(0, '{}_l{}'.format(col, j))
            # Compute lead values (no + 1 in the range because the base period has no shift, e.g. y becomes y_f1)
            for j in range(periods_post):
                if j > 0:  # No shift necessary for the base period because it already exists
                    es_extended_frame['{}_f{}'.format(col, j + 1)] = es_extended_frame[col].shift(periods=-j)
                if col in include:
                    column_order[i].append('{}_f{}'.format(col, j + 1))

        valid_rows = ~pd.isna(es_extended_frame[col])  # Demarcate valid rows (all should start off True; `col` is simply the last column from the loop above)
        # Stable pre-trend
        for col in stable_pre:
            for i in range(2, periods_pre + 1):  # Shift 1 is baseline
                valid_rows = valid_rows & (es_extended_frame[col].shift(periods=1) == es_extended_frame[col].shift(periods=i))

        # Stable post-trend
        for col in stable_post:
            for i in range(1, periods_post):  # Shift 0 is baseline
                valid_rows = valid_rows & (es_extended_frame[col] == es_extended_frame[col].shift(periods=-i))

        # Update with pre- and/or post-trend
        es_extended_frame = es_extended_frame[valid_rows]

        # Rename base period to have _f1 (e.g. y becomes y_f1)
        es_extended_frame.rename({col: col + '_f1' for col in include}, axis=1, inplace=True)

        # Keep rows with valid moves
        es_extended_frame = es_extended_frame[es_extended_frame['valid_move'] == 1].reset_index(drop=True)

        # Drop irrelevant columns
        es_extended_frame.drop('valid_move', axis=1, inplace=True)

        # Correct datatypes
        for i, col in enumerate(include):
            es_extended_frame[column_order[i]] = es_extended_frame[column_order[i]].astype(self.col_dtype_dict[col])

        col_order = []
        for order in column_order:
            col_order += order

        # Reorder columns
        es_extended_frame = es_extended_frame[['i', 't'] + col_order]

        # Return es_extended_frame
        return es_extended_frame
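
The lag/lead naming convention (_lN for pre-periods, _fN for post-periods, with the base period becoming _f1) can be seen in a minimal standalone sketch with hypothetical single-worker data, leaving out the valid-move and stable-trend filters the real method applies:

import pandas as pd

df = pd.DataFrame({'i': [0] * 5, 't': list(range(2000, 2005)),
                   'y': [1.0, 2.0, 3.0, 4.0, 5.0]})
periods_pre, periods_post = 2, 2

for j in range(1, periods_pre + 1):  # lags: y_l1, y_l2
    df['y_l{}'.format(j)] = df['y'].shift(periods=j)
for j in range(1, periods_post):     # leads: y_f2 (the base period becomes y_f1)
    df['y_f{}'.format(j + 1)] = df['y'].shift(periods=-j)
df = df.rename({'y': 'y_f1'}, axis=1)
# A transition at t=2002 reads y_l2=1.0, y_l1=2.0, y_f1=3.0, y_f2=4.0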