Esempio n. 1
0
def predict_abc(interp,
                extrap,
                interp_index,
                extrap_index,
                weight,
                interp_weights,
                extrap_weights,
                cs,
                abc,
                verbose=True):
    """Fit age-by-age weighted earnings regressions and project ABC earnings.

    For every age in 22-29 (the "interp" range) and 31-67 (the "extrap"
    range) a WLS regression of ``inc_labor<age>`` on the configured
    predictors plus one lagged labor-income column is fit on the auxiliary
    sample, once each for the 'pooled', 'male' and 'female' samples.  The
    residuals are resampled with replacement to give error draws for the
    ABC rows, and the coefficients are then applied age by age to ``abc``,
    feeding each prediction back in as the lagged income ``y``.

    Relies on names that are not defined in this function and must exist at
    module level: ``cols``, ``pd``, ``np``, ``sm``, ``dmatrices``,
    ``deepcopy`` and ``sqrt``.

    Args:
        interp: auxiliary DataFrame for ages 22-29; rows are selected with
            ``interp.loc[interp_index, :]``.  The code reads its ``black``
            and ``male`` columns, the ``cols.interp.predictors`` columns and
            ``inc_labor21`` .. ``inc_labor29``.
        extrap: auxiliary DataFrame for ages 31-67, selected with
            ``extrap.loc[extrap_index, :]``.  The lag used for age 31 is
            ``inc_labor29``.
        interp_index: row labels used to select from ``interp`` and from
            ``interp_weights``.
        extrap_index: row labels used to select from ``extrap``; element
            ``[1]`` of each label is used to select from ``extrap_weights``.
        weight: suffix of the weight column name.  The values 'treat' and
            'control' additionally restrict the ABC rows to ``R == 1`` and
            ``R == 0`` respectively.
        interp_weights: DataFrame with a ``draw`` column and an ``id``
            column or index level.  MODIFIED IN PLACE (index reset, ``draw``
            deleted, re-indexed by ``id``).
        extrap_weights: same as ``interp_weights``, for the extrap range;
            also modified in place.
        cs: string inserted in the weight column name
            ``'wtabc_allids_c' + cs + '_' + weight``.
        abc: ABC DataFrame with ``male`` and ``R`` columns; its first index
            level is read as ``id``.  An ``Intercept`` column is added to
            it in place.
        verbose: if true, print one line per age.

    Returns:
        Tuple ``(params_interp, params_extrap, error_mat, projection_interp,
        projection_extrap)``; each element is a dict keyed by 'pooled',
        'male' and 'female'.  Note that the ``y`` coefficient of
        ``params_extrap[sex]`` at age 31 is overwritten with 0 before the
        projection, so the returned table carries that 0.

    The error draws use ``np.random.choice``, so results depend on NumPy's
    global random state.
    """

    # Ages to model; age 30 is in neither range.  list(...) keeps the
    # concatenation valid when range() does not return a list.
    ages = list(range(22, 30)) + list(range(31, 68))

    # set up dictionaries to store output
    params_interp = {}
    params_extrap = {}
    error_mat = {}

    # NaN-filled coefficient tables (one row per age) and empty error frames
    for sex in ['pooled', 'male', 'female']:
        params_interp[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.interp.predictors) + 3)]
             for k in range(22, 30)],
            index=range(22, 30))
        params_interp[sex].index.names = ['age']
        params_interp[sex].columns = (
            ['Intercept'] + cols.interp.predictors + ['y'] + ['rmse'])

        params_extrap[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.extrap.predictors) + 3)]
             for k in range(31, 68)],
            index=range(31, 68))
        params_extrap[sex].index.names = ['age']
        params_extrap[sex].columns = (
            ['Intercept'] + cols.extrap.predictors + ['y'] + ['rmse'])
        error_mat[sex] = pd.DataFrame([])

    # obtain parameters for every age
    for age in ages:

        if age in range(22, 30):
            aux = deepcopy(interp.loc[interp_index, :])
            if age == 22:
                # Done once, at the first interp age: re-key the weights by
                # 'id' and keep a private copy of the selected rows.
                interp_weights.reset_index(inplace=True)
                del interp_weights['draw']
                interp_weights.set_index('id', inplace=True, drop=True)
                weight_array = deepcopy(
                    interp_weights.loc[pd.IndexSlice[interp_index], :])

            age_x = age - 1
            predictors = cols.interp.predictors + ['inc_labor{}'.format(age_x)]

        elif age in range(31, 68):
            aux = deepcopy(extrap.loc[extrap_index, :])
            # Age 30 is not modelled, so the lag for age 31 is income at 29.
            if age == 31:
                age_x = 29
            else:
                age_x = age - 1
            predictors = cols.extrap.predictors + ['inc_labor{}'.format(age_x)]

            if age == 31:
                # Done once, at the first extrap age: switch weight_array
                # over to the extrap weights.
                extrap_index_weight = [x[1] for x in extrap_index]

                extrap_weights.reset_index(inplace=True)
                del extrap_weights['draw']
                extrap_weights.set_index('id', inplace=True, drop=True)
                weight_array = deepcopy(
                    extrap_weights.loc[extrap_index_weight, :])

        # name of the outcome column for this age
        c = 'inc_labor{}'.format(age)

        # keep only the auxiliary rows with black == 1
        aux = aux.loc[aux.black == 1]

        # obtain parameters for different sexes
        for sex in ['pooled', 'male', 'female']:

            if sex == 'pooled':
                data = aux
                abcd = abc
                abcd_count = abcd.shape[0]

            elif sex == 'male':
                data = aux.loc[aux.male == 1]
                abcd = abc.loc[abc.male == 1]
                abcd_count = abcd.loc[abcd['male'] == 1]['male'].count()

            else:
                data = aux.loc[aux.male == 0]
                abcd = abc.loc[abc.male == 0]
                abcd_count = abcd.loc[abcd['male'] == 0]['male'].count()

            # NOTE(review): abcd_count is taken before the treat/control
            # filter below, so ehat can end up with more rows than abcd when
            # weight is 'treat' or 'control' - confirm this is intended.
            if weight == 'treat':
                abcd = abcd.loc[abcd.R == 1]
            elif weight == 'control':
                abcd = abcd.loc[abcd.R == 0]

            # reset auxiliary index (because dmatrices won't use id)
            # For 'pooled', data is aux itself, so aux is renumbered too.
            data.reset_index('id', drop=True, inplace=True)
            data.index = [j for j in range(data.shape[0])]

            # NOTE(review): data and weight_array are each renumbered
            # 0..n-1 independently, so weights are matched to regression
            # rows purely by position; data has the black/sex filters
            # applied and weight_array does not - confirm the alignment.
            weight_array.reset_index('id', drop=True, inplace=True)
            weight_array.index = [j for j in range(weight_array.shape[0])]

            # create design matrix for regressions
            fmla = '{} ~ {}'.format(c, ' + '.join(predictors))
            endog, exog = dmatrices(fmla, data, return_type='dataframe')
            exog = sm.add_constant(exog)
            exog_index = [x for x in exog.index]
            weight_forWLS = weight_array.loc[pd.IndexSlice[exog_index]]
            weight_type = 'wtabc_allids_c' + cs + '_' + weight
            weight_forWLS = weight_forWLS.loc[:, weight_type]
            weight_forWLS.dropna(axis=0, inplace=True)

            # keep only the rows that have a non-missing weight
            exog = exog.loc[weight_forWLS.index, :]
            endog = endog.loc[weight_forWLS.index, :]

            # estimate coefficients
            fail_switch = 0
            try:
                model = sm.WLS(endog, exog, weights=weight_forWLS)
                fit = model.fit()
                params = fit.params
                resid = fit.resid
            except Exception:
                # Best effort: a failed fit yields NaN coefficients and
                # residuals instead of aborting the whole run.  Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                fail_switch = 1
                if age in range(22, 30):
                    params = pd.Series(
                        [np.nan for j in range(1 + len(predictors))],
                        index=['Intercept'] + cols.interp.predictors + ['y'])
                else:
                    params = pd.Series(
                        [np.nan for j in range(1 + len(predictors))],
                        index=['Intercept'] + cols.extrap.predictors + ['y'])
                resid = pd.Series([np.nan for j in range(endog.shape[0])])

            # calculate RMSE and append it to the coefficient vector
            rmse = resid * resid
            rmse = pd.Series(sqrt(rmse.mean(axis=0)), index=['rmse'])
            params = pd.concat([params, rmse], axis=0)
            # the lagged-income coefficient is stored under the common
            # label 'y' regardless of which lag column was used
            params.rename({'inc_labor{}'.format(age_x): 'y'}, inplace=True)
            if age in range(22, 30):
                params_interp[sex].loc[age, :] = params
            else:
                params_extrap[sex].loc[age, :] = params

            # resample the errors, and merge in with ABC IDs
            if fail_switch == 0:
                ehat = pd.DataFrame(np.random.choice(resid, size=abcd_count))
            else:
                ehat = pd.DataFrame([np.nan for j in range(abcd_count)])
            abcd_ix = abcd.reset_index(level=0)
            ehat = pd.concat([abcd_ix.loc[:, 'id'], ehat], axis=1)
            ehat.columns = ['id', age]
            ehat.columns.name = 'age'
            ehat.set_index('id', inplace=True)
            error_mat[sex] = pd.concat([error_mat[sex], ehat], axis=1)

        if verbose:
            # exog is the design matrix of the last sex fitted ('female');
            # the line is printed whether or not the fits succeeded.
            print('Successful predictions, age {}, n={}'.format(
                age, exog.shape[0]))

    # add treatment indicator back into error matrix, add column names
    treat = abc.loc[:, 'R']
    for sex in ['pooled', 'male', 'female']:
        error_mat[sex] = pd.concat([error_mat[sex], treat],
                                   axis=1,
                                   join='inner')
        params_interp[sex].columns.name = 'variable'
        params_extrap[sex].columns.name = 'variable'

    # Row labels of ABC individuals with any missing predictor.
    # NOTE(review): abcd here is whatever the loops above left behind (the
    # 'female' subset, possibly R-filtered), so the male masks look like
    # they always select nothing - confirm whether abc was intended.
    male_interp_nix = abcd.loc[abcd.male == 1].loc[pd.isnull(
        abcd.loc[abcd.male == 1, cols.interp.predictors]).any(axis=1)].index
    female_interp_nix = abcd.loc[abcd.male == 0].loc[pd.isnull(
        abcd.loc[abcd.male == 0, cols.interp.predictors]).any(axis=1)].index

    male_extrap_nix = abcd.loc[abcd.male == 1].loc[pd.isnull(
        abcd.loc[abcd.male == 1, cols.extrap.predictors]).any(axis=1)].index
    female_extrap_nix = abcd.loc[abcd.male == 0].loc[pd.isnull(
        abcd.loc[abcd.male == 0, cols.extrap.predictors]).any(axis=1)].index

    # Remove errors for ABC individuals for whom we do not predict earnings.
    # NOTE(review): .loc slices columns by label, and the columns of
    # error_mat are labelled with ages plus 'R'; it is unclear that the
    # slices 0..8 and 9..45 select the intended age columns - confirm.

    # interp: one check covers every age because all interp ages share the
    # same predictor set
    error_mat['male'].loc[male_interp_nix, slice(0, 8)] = np.nan
    error_mat['female'].loc[female_interp_nix, slice(0, 8)] = np.nan
    error_mat['pooled'].loc[female_interp_nix.append(male_interp_nix),
                            slice(0, 8)] = np.nan

    # extrap: likewise, all extrap ages share the same predictor set
    error_mat['male'].loc[male_extrap_nix, slice(9, 45)] = np.nan
    error_mat['female'].loc[female_extrap_nix, slice(9, 45)] = np.nan
    error_mat['pooled'].loc[female_extrap_nix.append(male_extrap_nix),
                            slice(9, 45)] = np.nan

    # predict earnings
    projection_interp = {}
    projection_extrap = {}
    # constant regressor matching the 'Intercept' coefficient (mutates abc)
    abc.loc[:, 'Intercept'] = [1 for j in range(abc.shape[0])]

    for sex in ['pooled', 'male', 'female']:
        if sex == 'pooled':
            abcd = abc

        elif sex == 'male':
            abcd = abc.loc[abc.male == 1]

        else:
            abcd = abc.loc[abc.male == 0]

        # regressor matrices in the same column order as the coefficients
        abcd_interp = abcd.loc[:,
                               ['Intercept'] + cols.interp.predictors + ['y']]
        abcd_extrap = abcd.loc[:,
                               ['Intercept'] + cols.extrap.predictors + ['y']]

        projection_interp[sex] = pd.DataFrame([])
        projection_extrap[sex] = pd.DataFrame([])

        for age in ages:
            if age in range(22, 30):
                if age == 22:
                    # the first interp age starts from a lag of 0
                    abcd_interp['y'] = 0
                params_interp_trans = pd.DataFrame(
                    params_interp[sex].loc[age].drop('rmse').T)
                interp_dot = abcd_interp.dot(
                    params_interp_trans) + error_mat[sex][[age]]
                # this age's prediction is the lag for the next age
                abcd_interp['y'] = interp_dot
                projection_interp[sex] = pd.concat(
                    [projection_interp[sex], interp_dot], axis=1)

            else:

                if age == 31:
                    # Zero the lag coefficient at age 31.  Written as a
                    # single .loc[row, col] assignment so the value is
                    # stored in the frame itself rather than possibly in a
                    # temporary row copy (chained assignment).
                    params_extrap[sex].loc[31, 'y'] = 0
                    # seed the lag with the last interp prediction (age 29)
                    abcd_extrap['y'] = interp_dot
                    # Missing lags become 0; assigned back explicitly
                    # instead of an in-place fillna on a selected column.
                    abcd_extrap['y'] = abcd_extrap['y'].fillna(value=0)
                params_extrap_trans = pd.DataFrame(
                    params_extrap[sex].loc[age].drop('rmse').T)
                extrap_dot = abcd_extrap.dot(
                    params_extrap_trans) + error_mat[sex][[age]]
                abcd_extrap['y'] = extrap_dot
                projection_extrap[sex] = pd.concat(
                    [projection_extrap[sex], extrap_dot], axis=1)

    return params_interp, params_extrap, error_mat, projection_interp, projection_extrap
def predict_abc(extrap, extrap_index, abc, verbose=True):
    """Fit age-by-age OLS earnings regressions and project ABC earnings row by row.

    For every age in 21-67 an OLS regression of ``inc_labor<age>`` on
    ``cols.extrap.predictors`` plus the previous age's labor income is fit
    on the auxiliary sample.  Residuals are resampled with replacement to
    give error draws for the ABC rows.  Each ABC row is then projected
    forward from its own ``last_age`` up to age 67, starting from
    ``inc_labor_last`` and feeding each prediction back in as the lag ``y``.

    Relies on names that are not defined in this function and must exist at
    module level: ``cols``, ``pd``, ``np``, ``sm``, ``dmatrices``,
    ``deepcopy``, ``sqrt``, ``male_extrap_nix`` and ``female_extrap_nix``.

    Args:
        extrap: auxiliary DataFrame; rows are selected with
            ``extrap.loc[extrap_index, :]``.  The code reads the
            ``cols.extrap.predictors`` columns and ``inc_labor20`` ..
            ``inc_labor67``.
        extrap_index: row labels used to select from ``extrap``.
        abc: ABC DataFrame.  The code reads its ``male_subject``, ``R``,
            ``last_age`` and ``inc_labor_last`` columns and reads its first
            index level as ``id``.  An ``Intercept`` column is added to it
            in place.
        verbose: if true, print one line per age.

    Returns:
        Tuple ``(params_extrap, error_mat, projection_extrap, abc)``; the
        first three are dicts keyed by 'pooled', 'male' and 'female'.

    The error draws use ``np.random.choice``, so results depend on NumPy's
    global random state.
    """

    # set up age range
    ages = range(21, 68)

    # set up dictionaries to store output
    params_extrap = {}
    error_mat = {}

    # NaN-filled coefficient tables (one row per age) and empty error frames
    for sex in ['pooled', 'male', 'female']:
        params_extrap[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.extrap.predictors) + 3)]
             for k in range(21, 68)],
            index=range(21, 68))
        params_extrap[sex].index.names = ['age']
        params_extrap[sex].columns = (
            ['Intercept'] + cols.extrap.predictors + ['y'] + ['rmse'])
        error_mat[sex] = pd.DataFrame([])

    # obtain parameters for every age
    for age in ages:
        age_x = age - 1
        predictors = cols.extrap.predictors + ['inc_labor{}'.format(age_x)]

        aux = deepcopy(extrap.loc[extrap_index, :])

        # name of the outcome column for this age
        c = 'inc_labor{}'.format(age)

        # obtain parameters for different sexes
        for sex in ['pooled', 'male', 'female']:

            # NOTE(review): every sex is fit on the same unfiltered
            # auxiliary sample; only the ABC subset differs - confirm that
            # no male/female filter on aux was intended.
            data = aux
            if sex == 'pooled':
                abcd = abc
                abcd_count = abcd.shape[0]
            elif sex == 'male':
                abcd = abc.loc[abc.male_subject == 1]
                abcd_count = abcd.loc[
                    abcd['male_subject'] == 1]['male_subject'].count()
            else:
                abcd = abc.loc[abc.male_subject == 0]
                abcd_count = abcd.loc[
                    abcd['male_subject'] == 0]['male_subject'].count()

            # reset auxiliary index because sm.OLS drops some rows
            # (data is aux itself, so this runs on the same frame for
            # every sex)
            data.reset_index('id', drop=True, inplace=True)
            data.index = [j for j in range(data.shape[0])]

            # create design matrix for regressions
            fmla = '{} ~ {}'.format(c, ' + '.join(predictors))
            endog, exog = dmatrices(fmla, data, return_type='dataframe')
            exog = sm.add_constant(exog)

            # estimate coefficients
            fail_switch = 0
            try:
                model = sm.OLS(endog, exog)
                fit = model.fit()
                params = fit.params
                resid = fit.resid
            except Exception:
                # Best effort: a failed fit yields NaN coefficients and
                # residuals instead of aborting the whole run.  Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                fail_switch = 1

                params = pd.Series(
                    [np.nan for j in range(1 + len(predictors))],
                    index=['Intercept'] + cols.extrap.predictors + ['y'])
                resid = pd.Series([np.nan for j in range(endog.shape[0])])

            # calculate RMSE and append it to the coefficient vector
            rmse = resid * resid
            rmse = pd.Series(sqrt(rmse.mean(axis=0)), index=['rmse'])
            params = pd.concat([params, rmse], axis=0)
            # the lagged-income coefficient is stored under the label 'y'
            params.rename({'inc_labor{}'.format(age_x): 'y'}, inplace=True)
            params_extrap[sex].loc[age, :] = params

            # resample the errors, and merge in with ABC IDs
            if fail_switch == 0:
                ehat = pd.DataFrame(np.random.choice(resid, size=abcd_count))
            else:
                ehat = pd.DataFrame([np.nan for j in range(abcd_count)])
            abcd_ix = abcd.reset_index(level=0)
            ehat = pd.concat([abcd_ix.loc[:, 'id'], ehat], axis=1)
            ehat.columns = ['id', age]
            ehat.columns.name = 'age'
            ehat.set_index('id', inplace=True)
            error_mat[sex] = pd.concat([error_mat[sex], ehat], axis=1)

        if verbose:
            # exog is the design matrix of the last sex fitted; the line is
            # printed whether or not the fits succeeded.
            print('Successful predictions, age {}, n={}'.format(
                age, exog.shape[0]))

    # add treatment indicator back into error matrix, add column names
    treat = abc.loc[:, 'R']
    for sex in ['pooled', 'male', 'female']:
        error_mat[sex] = pd.concat([error_mat[sex], treat],
                                   axis=1,
                                   join='inner')
        params_extrap[sex].columns.name = 'variable'

    # Remove errors for ABC individuals for whom we do not predict earnings.
    # NOTE(review): male_extrap_nix / female_extrap_nix are not defined in
    # this function; they must exist at module level or this raises
    # NameError.  Also, .loc slices columns by label and the columns here
    # are labelled with ages plus 'R', so it is unclear that 9..45 selects
    # the intended age columns - confirm.
    error_mat['male'].loc[male_extrap_nix, slice(9, 45)] = np.nan
    error_mat['female'].loc[female_extrap_nix, slice(9, 45)] = np.nan
    error_mat['pooled'].loc[female_extrap_nix.append(male_extrap_nix),
                            slice(9, 45)] = np.nan

    # predict earnings
    projection_extrap = {}
    # constant regressor matching the 'Intercept' coefficient (mutates abc)
    abc.loc[:, 'Intercept'] = [1 for j in range(abc.shape[0])]

    for sex in ['pooled', 'male', 'female']:

        if sex == 'pooled':
            abcd = abc

        elif sex == 'male':
            abcd = abc.loc[abc.male_subject == 1]

        else:
            abcd = abc.loc[abc.male_subject == 0]

        # regressor matrix in the same column order as the coefficients
        abcd_extrap = abcd.loc[:,
                               ['Intercept'] + cols.extrap.predictors + ['y']]

        projection_extrap[sex] = pd.DataFrame([])

        # project each ABC row separately, starting at that row's last_age
        for row_id, row in abcd.iterrows():
            # one NaN slot per age 21..68; list(...) keeps the index a list
            # of lists when range() does not return a list
            age_extrap = pd.DataFrame([np.nan for k in range(21, 69)],
                                      index=[list(range(21, 69))])
            age_extrap.index.names = ['age']
            tmp_age = row.loc['last_age']
            # seed the lag with the row's last observed labor income
            abcd_extrap.loc[row_id, 'y'] = row.loc['inc_labor_last']

            # rows whose last_age is 20 or less (or NaN) are not projected
            if tmp_age > 20:
                for age in range(tmp_age, 68):
                    params_extrap_trans = pd.DataFrame(
                        params_extrap[sex].loc[age].drop('rmse').T)
                    extrap_dot = abcd_extrap.loc[row_id, :].dot(
                        params_extrap_trans
                    ) + error_mat[sex][[age]].loc[row_id, :]
                    # this age's prediction is the lag for the next age
                    abcd_extrap.loc[row_id, 'y'] = extrap_dot.iloc[0]
                    age_extrap.loc[age] = extrap_dot.iloc[0]

                # store the row label under the extra key 69 so it can be
                # used as the index of the stacked projections below
                age_extrap.loc[69] = row_id

                projection_extrap[sex] = pd.concat(
                    [projection_extrap[sex], age_extrap.T], axis=0)
        projection_extrap[sex].set_index(projection_extrap[sex].loc[:, 69],
                                         inplace=True,
                                         drop=True)

    return params_extrap, error_mat, projection_extrap, abc
def predict_abc(interp, extrap, interp_index, extrap_index, abc, verbose=True):
    """Fit age-by-age OLS earnings regressions and project ABC earnings.

    For every age in 22-29 (the "interp" range) and 31-67 (the "extrap"
    range) an OLS regression of ``inc_labor<age>`` on the configured
    predictors is fit on the auxiliary sample, once each for the 'pooled',
    'male' and 'female' samples.  Residuals are resampled with replacement
    to give error draws for the ABC rows, and the projections are the ABC
    regressors times the coefficients plus those error draws.

    Relies on names that are not defined in this function and must exist at
    module level: ``cols``, ``pd``, ``np``, ``sm``, ``dmatrices``,
    ``deepcopy``, ``sqrt``, ``male_interp_nix``, ``female_interp_nix``,
    ``male_extrap_nix`` and ``female_extrap_nix``.

    Args:
        interp: auxiliary DataFrame for ages 22-29; rows are selected with
            ``interp.loc[interp_index, :]``.  The code reads its ``male``
            column, the ``cols.interp.predictors`` columns and
            ``inc_labor22`` .. ``inc_labor29``.
        extrap: auxiliary DataFrame for ages 31-67, selected with
            ``extrap.loc[extrap_index, :]``; same layout with
            ``cols.extrap.predictors`` and ``inc_labor31`` ..
            ``inc_labor67``.
        interp_index: row labels used to select from ``interp``.
        extrap_index: row labels used to select from ``extrap``.
        abc: ABC DataFrame with ``male`` and ``R`` columns; its first index
            level is read as ``id``.  An ``Intercept`` column is added to
            it in place.
        verbose: if true, print one line per age.

    Returns:
        Tuple ``(params_interp, params_extrap, error_mat, projection_interp,
        projection_extrap)``; each element is a dict keyed by 'pooled',
        'male' and 'female'.

    The error draws use ``np.random.choice``, so results depend on NumPy's
    global random state.
    """

    # Ages to model; age 30 is in neither range.  list(...) keeps the
    # concatenation valid when range() does not return a list.
    ages = list(range(22, 30)) + list(range(31, 68))

    # set up dictionaries to store output
    params_interp = {}
    params_extrap = {}
    error_mat = {}

    # NaN-filled coefficient tables (one row per age) and empty error frames
    for sex in ['pooled', 'male', 'female']:
        params_interp[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.interp.predictors) + 2)]
             for k in range(22, 30)],
            index=range(22, 30))
        params_interp[sex].index.names = ['age']
        params_interp[sex].columns = (
            ['Intercept'] + cols.interp.predictors + ['rmse'])

        params_extrap[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.extrap.predictors) + 2)]
             for k in range(31, 68)],
            index=range(31, 68))
        params_extrap[sex].index.names = ['age']
        params_extrap[sex].columns = (
            ['Intercept'] + cols.extrap.predictors + ['rmse'])

        error_mat[sex] = pd.DataFrame([])

    # obtain parameters for every age
    for age in ages:

        if age in range(22, 30):
            predictors = cols.interp.predictors
            aux = deepcopy(interp.loc[interp_index, :])

        elif age in range(31, 68):
            predictors = cols.extrap.predictors
            aux = deepcopy(extrap.loc[extrap_index, :])

        # name of the outcome column for this age
        c = 'inc_labor{}'.format(age)

        # obtain parameters for different sexes
        for sex in ['pooled', 'male', 'female']:

            if sex == 'pooled':
                data = aux
                abcd = abc
                abcd_count = abcd.shape[0]

            elif sex == 'male':
                data = aux.loc[aux.male == 1]
                abcd = abc.loc[abc.male == 1]
                abcd_count = abcd.loc[abcd['male'] == 1]['male'].count()
            else:
                data = aux.loc[aux.male == 0]
                abcd = abc.loc[abc.male == 0]
                abcd_count = abcd.loc[abcd['male'] == 0]['male'].count()

            # Renumber the auxiliary rows 0..n-1.  For 'pooled', data is aux
            # itself, so aux is renumbered too.
            data.reset_index('id', drop=True, inplace=True)
            data.index = [j for j in range(data.shape[0])]

            # create design matrix for regressions
            fmla = '{} ~ {}'.format(c, ' + '.join(predictors))
            endog, exog = dmatrices(fmla, data, return_type='dataframe')
            exog = sm.add_constant(exog)

            # estimate coefficients
            fail_switch = 0
            try:
                model = sm.OLS(endog, exog)
                fit = model.fit()
                params = fit.params
                resid = fit.resid
            except Exception:
                # Best effort: a failed fit yields NaN coefficients and
                # residuals instead of aborting the whole run.  Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                fail_switch = 1
                params = pd.Series(
                    [np.nan for j in range(1 + len(predictors))],
                    index=['Intercept'] + predictors)
                resid = pd.Series([np.nan for j in range(endog.shape[0])])

            # calculate RMSE and append it to the coefficient vector
            rmse = resid * resid
            rmse = pd.Series(sqrt(rmse.mean(axis=0)), index=['rmse'])
            params = pd.concat([params, rmse], axis=0)

            if age in range(22, 30):
                params_interp[sex].loc[age, :] = params

            else:
                params_extrap[sex].loc[age, :] = params

            # resample the errors, and merge in with ABC IDs
            if fail_switch == 0:
                ehat = pd.DataFrame(np.random.choice(resid, size=abcd_count))
            else:
                ehat = pd.DataFrame([np.nan for j in range(abcd_count)])
            abcd_ix = abcd.reset_index(level=0)
            ehat = pd.concat([abcd_ix.loc[:, 'id'], ehat], axis=1)
            ehat.columns = ['id', age]
            ehat.columns.name = 'age'
            ehat.set_index('id', inplace=True)
            error_mat[sex] = pd.concat([error_mat[sex], ehat], axis=1)

        if verbose:
            # exog is the design matrix of the last sex fitted ('female');
            # the line is printed whether or not the fits succeeded.
            print('Successful predictions, age {}, n={}'.format(
                age, exog.shape[0]))

    # add treatment indicator back into error matrix, add column names
    treat = abc.loc[:, 'R']
    for sex in ['pooled', 'male', 'female']:
        error_mat[sex] = pd.concat([error_mat[sex], treat],
                                   axis=1,
                                   join='inner')
        params_interp[sex].columns.name = 'variable'
        params_extrap[sex].columns.name = 'variable'

    # Remove errors for ABC individuals for whom we do not predict earnings.
    # NOTE(review): the four *_nix names are not defined in this function;
    # they must exist at module level or this raises NameError.  Also, .loc
    # slices columns by label and the columns here are labelled with ages
    # plus 'R', so it is unclear that 0..8 / 9..45 select the intended age
    # columns - confirm.

    # interp: one check covers every age because all interp ages share the
    # same predictor set
    error_mat['male'].loc[male_interp_nix, slice(0, 8)] = np.nan
    error_mat['female'].loc[female_interp_nix, slice(0, 8)] = np.nan
    error_mat['pooled'].loc[female_interp_nix.append(male_interp_nix),
                            slice(0, 8)] = np.nan

    # extrap: likewise, all extrap ages share the same predictor set
    error_mat['male'].loc[male_extrap_nix, slice(9, 45)] = np.nan
    error_mat['female'].loc[female_extrap_nix, slice(9, 45)] = np.nan
    error_mat['pooled'].loc[female_extrap_nix.append(male_extrap_nix),
                            slice(9, 45)] = np.nan

    # predict earnings
    projection_interp = {}
    projection_extrap = {}
    # constant regressor matching the 'Intercept' coefficient (mutates abc)
    abc.loc[:, 'Intercept'] = [1 for j in range(abc.shape[0])]
    for sex in ['pooled', 'male', 'female']:

        if sex == 'pooled':
            abcd = abc

        elif sex == 'male':
            abcd = abc.loc[abc.male == 1]

        else:
            abcd = abc.loc[abc.male == 0]

        # regressor matrices in the same column order as the coefficients
        # (built once here; the construction is identical for every sex)
        abcd_interp = abcd.loc[:, ['Intercept'] + cols.interp.predictors]
        abcd_extrap = abcd.loc[:, ['Intercept'] + cols.extrap.predictors]

        # perform projections using dot product, add back in the errors
        projection_interp[sex] = abcd_interp.dot(
            params_interp[sex].drop('rmse', axis=1).T
        ) + error_mat[sex].drop('R', axis=1).loc[:, slice(22, 29)]
        projection_extrap[sex] = abcd_extrap.dot(
            params_extrap[sex].drop('rmse', axis=1).T
        ) + error_mat[sex].drop('R', axis=1).loc[:, slice(31, 67)]

    return params_interp, params_extrap, error_mat, projection_interp, projection_extrap
def predict_abc(interp, extrap, interp_index, extrap_index, weight, interp_weights, extrap_weights, cs, abc, verbose=True):
    """Fit age-by-age weighted earnings regressions and project ABC earnings.

    For every age in 22-29 (the "interp" range) and 31-67 (the "extrap"
    range) a WLS regression of ``inc_labor<age>`` on the configured
    predictors plus one lagged labor-income column is fit on the auxiliary
    sample, once each for the 'pooled', 'male' and 'female' samples.  The
    residuals are resampled with replacement to give error draws for the
    ABC rows, and the coefficients are then applied age by age to ``abc``,
    feeding each prediction back in as the lagged income ``y``.

    Relies on names that are not defined in this function and must exist at
    module level: ``cols``, ``pd``, ``np``, ``sm``, ``dmatrices``,
    ``deepcopy`` and ``sqrt``.

    Args:
        interp: auxiliary DataFrame for ages 22-29; rows are selected with
            ``interp.loc[interp_index, :]``.  The code reads its ``black``
            and ``male`` columns, the ``cols.interp.predictors`` columns and
            ``inc_labor21`` .. ``inc_labor29``.
        extrap: auxiliary DataFrame for ages 31-67, selected with
            ``extrap.loc[extrap_index, :]``.  The lag used for age 31 is
            ``inc_labor29``.
        interp_index: row labels used to select from ``interp`` and from
            ``interp_weights``.
        extrap_index: row labels used to select from ``extrap``; element
            ``[1]`` of each label is used to select from ``extrap_weights``.
        weight: suffix of the weight column name.  The values 'treat' and
            'control' additionally restrict the ABC rows to ``R == 1`` and
            ``R == 0`` respectively.
        interp_weights: DataFrame with a ``draw`` column and an ``id``
            column or index level.  MODIFIED IN PLACE (index reset, ``draw``
            deleted, re-indexed by ``id``).
        extrap_weights: same as ``interp_weights``, for the extrap range;
            also modified in place.
        cs: string inserted in the weight column name
            ``'wtabc_allids_c' + cs + '_' + weight``.
        abc: ABC DataFrame with ``male`` and ``R`` columns; its first index
            level is read as ``id``.  An ``Intercept`` column is added to
            it in place.
        verbose: if true, print one line per age.

    Returns:
        Tuple ``(params_interp, params_extrap, error_mat, projection_interp,
        projection_extrap)``; each element is a dict keyed by 'pooled',
        'male' and 'female'.  Note that the ``y`` coefficient of
        ``params_extrap[sex]`` at age 31 is overwritten with 0 before the
        projection, so the returned table carries that 0.

    The error draws use ``np.random.choice``, so results depend on NumPy's
    global random state.
    """

    # Ages to model; age 30 is in neither range.  list(...) keeps the
    # concatenation valid when range() does not return a list.
    ages = list(range(22, 30)) + list(range(31, 68))

    # set up dictionaries to store output
    params_interp = {}
    params_extrap = {}
    error_mat = {}

    # NaN-filled coefficient tables (one row per age) and empty error frames
    for sex in ['pooled', 'male', 'female']:
        params_interp[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.interp.predictors) + 3)]
             for k in range(22, 30)],
            index=range(22, 30))
        params_interp[sex].index.names = ['age']
        params_interp[sex].columns = (
            ['Intercept'] + cols.interp.predictors + ['y'] + ['rmse'])

        params_extrap[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.extrap.predictors) + 3)]
             for k in range(31, 68)],
            index=range(31, 68))
        params_extrap[sex].index.names = ['age']
        params_extrap[sex].columns = (
            ['Intercept'] + cols.extrap.predictors + ['y'] + ['rmse'])
        error_mat[sex] = pd.DataFrame([])

    # obtain parameters for every age
    for age in ages:

        if age in range(22, 30):
            aux = deepcopy(interp.loc[interp_index, :])
            if age == 22:
                # Done once, at the first interp age: re-key the weights by
                # 'id' and keep a private copy of the selected rows.
                interp_weights.reset_index(inplace=True)
                del interp_weights['draw']
                interp_weights.set_index('id', inplace=True, drop=True)
                weight_array = deepcopy(
                    interp_weights.loc[pd.IndexSlice[interp_index], :])

            age_x = age - 1
            predictors = cols.interp.predictors + ['inc_labor{}'.format(age_x)]

        elif age in range(31, 68):
            aux = deepcopy(extrap.loc[extrap_index, :])
            # Age 30 is not modelled, so the lag for age 31 is income at 29.
            if age == 31:
                age_x = 29
            else:
                age_x = age - 1
            predictors = cols.extrap.predictors + ['inc_labor{}'.format(age_x)]

            if age == 31:
                # Done once, at the first extrap age: switch weight_array
                # over to the extrap weights.
                extrap_index_weight = [x[1] for x in extrap_index]

                extrap_weights.reset_index(inplace=True)
                del extrap_weights['draw']
                extrap_weights.set_index('id', inplace=True, drop=True)
                weight_array = deepcopy(
                    extrap_weights.loc[extrap_index_weight, :])

        # name of the outcome column for this age
        c = 'inc_labor{}'.format(age)

        # keep only the auxiliary rows with black == 1
        aux = aux.loc[aux.black == 1]

        # obtain parameters for different sexes
        for sex in ['pooled', 'male', 'female']:

            if sex == 'pooled':
                data = aux
                abcd = abc
                abcd_count = abcd.shape[0]

            elif sex == 'male':
                data = aux.loc[aux.male == 1]
                abcd = abc.loc[abc.male == 1]
                abcd_count = abcd.loc[abcd['male'] == 1]['male'].count()

            else:
                data = aux.loc[aux.male == 0]
                abcd = abc.loc[abc.male == 0]
                abcd_count = abcd.loc[abcd['male'] == 0]['male'].count()

            # NOTE(review): abcd_count is taken before the treat/control
            # filter below, so ehat can end up with more rows than abcd when
            # weight is 'treat' or 'control' - confirm this is intended.
            if weight == 'treat':
                abcd = abcd.loc[abcd.R == 1]
            elif weight == 'control':
                abcd = abcd.loc[abcd.R == 0]

            # reset auxiliary index (because dmatrices won't use id)
            # For 'pooled', data is aux itself, so aux is renumbered too.
            data.reset_index('id', drop=True, inplace=True)
            data.index = [j for j in range(data.shape[0])]

            # NOTE(review): data and weight_array are each renumbered
            # 0..n-1 independently, so weights are matched to regression
            # rows purely by position; data has the black/sex filters
            # applied and weight_array does not - confirm the alignment.
            weight_array.reset_index('id', drop=True, inplace=True)
            weight_array.index = [j for j in range(weight_array.shape[0])]

            # create design matrix for regressions
            fmla = '{} ~ {}'.format(c, ' + '.join(predictors))
            endog, exog = dmatrices(fmla, data, return_type='dataframe')
            exog = sm.add_constant(exog)
            exog_index = [x for x in exog.index]
            weight_forWLS = weight_array.loc[pd.IndexSlice[exog_index]]
            weight_type = 'wtabc_allids_c' + cs + '_' + weight
            weight_forWLS = weight_forWLS.loc[:, weight_type]
            weight_forWLS.dropna(axis=0, inplace=True)

            # keep only the rows that have a non-missing weight
            exog = exog.loc[weight_forWLS.index, :]
            endog = endog.loc[weight_forWLS.index, :]

            # estimate coefficients
            fail_switch = 0
            try:
                model = sm.WLS(endog, exog, weights=weight_forWLS)
                fit = model.fit()
                params = fit.params
                resid = fit.resid
            except Exception:
                # Best effort: a failed fit yields NaN coefficients and
                # residuals instead of aborting the whole run.  Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                fail_switch = 1
                if age in range(22, 30):
                    params = pd.Series(
                        [np.nan for j in range(1 + len(predictors))],
                        index=['Intercept'] + cols.interp.predictors + ['y'])
                else:
                    params = pd.Series(
                        [np.nan for j in range(1 + len(predictors))],
                        index=['Intercept'] + cols.extrap.predictors + ['y'])
                resid = pd.Series([np.nan for j in range(endog.shape[0])])

            # calculate RMSE and append it to the coefficient vector
            rmse = resid * resid
            rmse = pd.Series(sqrt(rmse.mean(axis=0)), index=['rmse'])
            params = pd.concat([params, rmse], axis=0)
            # the lagged-income coefficient is stored under the common
            # label 'y' regardless of which lag column was used
            params.rename({'inc_labor{}'.format(age_x): 'y'}, inplace=True)
            if age in range(22, 30):
                params_interp[sex].loc[age, :] = params
            else:
                params_extrap[sex].loc[age, :] = params

            # resample the errors, and merge in with ABC IDs
            if fail_switch == 0:
                ehat = pd.DataFrame(np.random.choice(resid, size=abcd_count))
            else:
                ehat = pd.DataFrame([np.nan for j in range(abcd_count)])
            abcd_ix = abcd.reset_index(level=0)
            ehat = pd.concat([abcd_ix.loc[:, 'id'], ehat], axis=1)
            ehat.columns = ['id', age]
            ehat.columns.name = 'age'
            ehat.set_index('id', inplace=True)
            error_mat[sex] = pd.concat([error_mat[sex], ehat], axis=1)

        if verbose:
            # exog is the design matrix of the last sex fitted ('female');
            # the line is printed whether or not the fits succeeded.
            print('Successful predictions, age {}, n={}'.format(
                age, exog.shape[0]))

    # add treatment indicator back into error matrix, add column names
    treat = abc.loc[:, 'R']
    for sex in ['pooled', 'male', 'female']:
        error_mat[sex] = pd.concat([error_mat[sex], treat],
                                   axis=1,
                                   join='inner')
        params_interp[sex].columns.name = 'variable'
        params_extrap[sex].columns.name = 'variable'

    # Row labels of ABC individuals with any missing predictor.
    # NOTE(review): abcd here is whatever the loops above left behind (the
    # 'female' subset, possibly R-filtered), so the male masks look like
    # they always select nothing - confirm whether abc was intended.
    male_interp_nix = abcd.loc[abcd.male == 1].loc[pd.isnull(
        abcd.loc[abcd.male == 1, cols.interp.predictors]).any(axis=1)].index
    female_interp_nix = abcd.loc[abcd.male == 0].loc[pd.isnull(
        abcd.loc[abcd.male == 0, cols.interp.predictors]).any(axis=1)].index

    male_extrap_nix = abcd.loc[abcd.male == 1].loc[pd.isnull(
        abcd.loc[abcd.male == 1, cols.extrap.predictors]).any(axis=1)].index
    female_extrap_nix = abcd.loc[abcd.male == 0].loc[pd.isnull(
        abcd.loc[abcd.male == 0, cols.extrap.predictors]).any(axis=1)].index

    # Remove errors for ABC individuals for whom we do not predict earnings.
    # NOTE(review): .loc slices columns by label, and the columns of
    # error_mat are labelled with ages plus 'R'; it is unclear that the
    # slices 0..8 and 9..45 select the intended age columns - confirm.

    # interp: one check covers every age because all interp ages share the
    # same predictor set
    error_mat['male'].loc[male_interp_nix, slice(0, 8)] = np.nan
    error_mat['female'].loc[female_interp_nix, slice(0, 8)] = np.nan
    error_mat['pooled'].loc[female_interp_nix.append(male_interp_nix),
                            slice(0, 8)] = np.nan

    # extrap: likewise, all extrap ages share the same predictor set
    error_mat['male'].loc[male_extrap_nix, slice(9, 45)] = np.nan
    error_mat['female'].loc[female_extrap_nix, slice(9, 45)] = np.nan
    error_mat['pooled'].loc[female_extrap_nix.append(male_extrap_nix),
                            slice(9, 45)] = np.nan

    # predict earnings
    projection_interp = {}
    projection_extrap = {}
    # constant regressor matching the 'Intercept' coefficient (mutates abc)
    abc.loc[:, 'Intercept'] = [1 for j in range(abc.shape[0])]

    for sex in ['pooled', 'male', 'female']:
        if sex == 'pooled':
            abcd = abc

        elif sex == 'male':
            abcd = abc.loc[abc.male == 1]

        else:
            abcd = abc.loc[abc.male == 0]

        # regressor matrices in the same column order as the coefficients
        abcd_interp = abcd.loc[:,
                               ['Intercept'] + cols.interp.predictors + ['y']]
        abcd_extrap = abcd.loc[:,
                               ['Intercept'] + cols.extrap.predictors + ['y']]

        projection_interp[sex] = pd.DataFrame([])
        projection_extrap[sex] = pd.DataFrame([])

        for age in ages:
            if age in range(22, 30):
                if age == 22:
                    # the first interp age starts from a lag of 0
                    abcd_interp['y'] = 0
                params_interp_trans = pd.DataFrame(
                    params_interp[sex].loc[age].drop('rmse').T)
                interp_dot = abcd_interp.dot(
                    params_interp_trans) + error_mat[sex][[age]]
                # this age's prediction is the lag for the next age
                abcd_interp['y'] = interp_dot
                projection_interp[sex] = pd.concat(
                    [projection_interp[sex], interp_dot], axis=1)

            else:

                if age == 31:
                    # Zero the lag coefficient at age 31.  Written as a
                    # single .loc[row, col] assignment so the value is
                    # stored in the frame itself rather than possibly in a
                    # temporary row copy (chained assignment).
                    params_extrap[sex].loc[31, 'y'] = 0
                    # seed the lag with the last interp prediction (age 29)
                    abcd_extrap['y'] = interp_dot
                    # Missing lags become 0; assigned back explicitly
                    # instead of an in-place fillna on a selected column.
                    abcd_extrap['y'] = abcd_extrap['y'].fillna(value=0)
                params_extrap_trans = pd.DataFrame(
                    params_extrap[sex].loc[age].drop('rmse').T)
                extrap_dot = abcd_extrap.dot(
                    params_extrap_trans) + error_mat[sex][[age]]
                abcd_extrap['y'] = extrap_dot
                projection_extrap[sex] = pd.concat(
                    [projection_extrap[sex], extrap_dot], axis=1)

    return params_interp, params_extrap, error_mat, projection_interp, projection_extrap