def nber_convert_dta_to_hdf(source_dir, target_dir, index='year'):
	if index == 'year':
		fn = target_dir + "nber_year.h5"
		store = pd.HDFStore(fn, complevel=9, complib='zlib')
		for year in range(1962, 2000 + 1):
			fn = source_dir + "wtf%s.dta" % str(year)[2:]
			print("Loading Year: %s from file: %s" % (year, fn))
			data = pd.read_stata(fn)
			store.put('Y' + str(year), data, format='table')
		print("HDF File Saved ...")
		print(store)
		store.close()
	else:
		frames = []
		for year in range(1962, 2000 + 1):
			fn = source_dir + "wtf%s.dta" % str(year)[2:]
			print("Loading Year: %s from file: %s" % (year, fn))
			frames.append(pd.read_stata(fn))
		data = pd.concat(frames)  # DataFrame.append was removed in pandas 2.0
		fn = target_dir + "nber.h5"
		store = pd.HDFStore(fn, complevel=9, complib='zlib')
		store.put('nber', data, format='table')
		print("HDF File Saved ...")
		print(store)
		store.close()
	print("Convert DTA to HDF Finished!")
def load_trade4digit_country():
    prescriptives = pd.read_stata(prefix_path("Trade/exp_ecomplexity_rc.dta"))

    exports = pd.read_stata(prefix_path("Trade/exp_rpy_rc_p4.dta"))
    exports = exports.rename(columns={"X_rpy_d": "export_value",
                                      "NP_rpy": "export_num_plants"})
    imports = pd.read_stata(prefix_path("Trade/imp_rpy_rc_p4.dta"))
    imports = imports.rename(columns={"X_rpy_d": "import_value",
                                      "NP_rpy": "import_num_plants"})

    descriptives = exports.merge(imports, on=["yr", "r", "p"], how="outer")
    descriptives = descriptives.fillna({
        "export_value": 0,
        "export_num_plants": 0,
        "import_value": 0,
        "import_num_plants": 0,
    })

    combo = prescriptives.merge(descriptives,
                                left_on=["yr", "r", "p4"],
                                right_on=["yr", "r", "p"])

    combo = combo[combo.yr.between(YEAR_MIN_TRADE, YEAR_MAX_TRADE)]
    combo["r"] = "COL"
    return combo
Example #3
File: code.py Project: rnyang/STAHP
 def stata_use(self, args, opts=()):
     
     def parse_filename(fname_arg):
         ext = '.dta'
         if fname_arg.endswith(ext):
             return fname_arg
         return fname_arg + ext
     
     ## Defaults
     labels = True
     
     ## Opts
     if "nolabel" in opts or "nol" in opts:
         labels = False
     
     if "clear" not in opts:
         if self.data is not None:
             print "you must start with an empty data set"
             return
     
     if 'using' not in args:
         fname = parse_filename(args[0])
         self.data = pd.read_stata(fname, convert_categoricals=labels)
     else:
         fname = parse_filename(args[args.index('using')+1])
         self.data = pd.read_stata(fname, convert_categoricals=labels)
         if 'if' in args:
             ifcondition = args[args.index('if')+1:args.index('using')] # todo: 'in'
             self.data = self.if_filter(ifcondition)
             varlist = args[:args.index('if')]
         else:
             varlist = args[:args.index('using')]
         if len(varlist) > 0:
             self.data = self.data[varlist]
    def compare(self, seuil_abs=100, seuil_rel=0.10):
        '''
        Compare the OF (OpenFisca) and TaxIPP calculations
        and handle the outputs.
        '''
        dta_output = self.paths['dta_output']
        ipp_output = read_stata(dta_output).fillna(0)
        dta_input = self.paths['dta_input']
        ipp_input =  read_stata(dta_input).fillna(0)
        openfisca_output = self.openfisca_output.fillna(0)
        openfisca_input = self.simulation.input_table.table
        ipp2of_output_variables = self.dic_var_output

        check_list = ['csg_sal_ded', 'irpp_net_foy', 'af_foys'] # 'csg_sal_ded',
        print(self.dic_param)
        
        for ipp_var in check_list:
            of_var = ipp2of_output_variables[ipp_var]
            entity = self.simulation.prestation_by_name[of_var].entity
            
            if entity == 'ind':
                conflict = ((ipp_output[ipp_var] - openfisca_output[of_var]).abs() > THRESHOLD)
                print(conflict.to_string())
                print(ipp_output.loc[conflict, ipp_var].to_string())
                print(openfisca_output.loc[conflict, of_var].to_string())

                print(ipp_input.loc[conflict, :].to_string())
                print(openfisca_input.loc[conflict, self.relevant_input_variables()].to_string())

                #error_diag()
                
                # TODO: finish by calling error_diag
            elif entity == "fam":
            
                pass
            
            
            elif entity == "foy":
                openfisca_foy = openfisca_output.loc[openfisca_input.quifoy == 0, of_var]
                ipp_foy = ipp_output.loc[openfisca_input.quifoy == 0, ipp_var]
                print(ipp_foy)

                conflict = ((ipp_foy - openfisca_foy).abs() > THRESHOLD)
                print(conflict.to_string())
                
            elif entity == "men":
                pass
        def _diff(seuil_abs, seuil_rel):
            # 'dic' was undefined in the original; the ipp -> openfisca output
            # mapping loaded above is the intended dictionary
            for k, v in ipp2of_output_variables.items():
                diff_abs = ipp_output[k].mean() - openfisca_output[v].mean()

                if diff_abs > seuil_abs:
                    print("Absolute difference for", k, ':', diff_abs)

                nonzero = (ipp_output[k] != 0) & (openfisca_output[v] != 0)
                diff_rel = (ipp_output.loc[nonzero, k] / openfisca_output.loc[nonzero, v]).mean()

                if diff_rel is not None and diff_rel > seuil_rel:
                    print("Relative difference for", k, ':', diff_rel)
Example #5
def load_pwt_data(version=80, deltas=False):
    """
    Load the Penn World Tables data as a Pandas Panel object. Function expects
    a local copy of the PWT data file in the working directory. If no local copy
    exists, then one will be downloaded automatically (assuming that you have a
    working internet connection!).

    Arguments:
 
        version:  (int) Version number for PWT data. Default is 80 (which is the 
                  most recent version).
                  
        deltas:   (boolean) Whether or not you wish to load the data on 
                  depreciation rates (which is included in a separate .dta 
                  file). Default is False.
    Returns:

        pwt:    A Pandas Panel object containing the Penn World Tables data.
        
    TODO: Work out a way to merge Pandas Panel objects.
    
    """        
    # first check for a local copy of PWT
    try: 
        data = pd.read_stata('pwt' + str(version) + '.dta')
                
    # otherwise, 
    except IOError:  
        get_pwt_data(version)
        data = pd.read_stata('pwt' + str(version) + '.dta')
     
    # convert to a panel object
    data.index = [data['year'], data['countrycode']]
    data = data.to_panel()

    if deltas:
        
        # load the separate file containing the depreciation rates
        dep_rates = pd.read_stata('depreciation_rates.dta')
        
        # convert to panel 
        dep_rates.index = [dep_rates['year'], dep_rates['countrycode']]
        dep_rates = dep_rates.to_panel()

        return [data, dep_rates]
        
    else:
        
        return data
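A minimal usage sketch, assuming a local pwt80.dta in the working directory; note that pd.Panel (and hence to_panel) was removed in pandas 0.25, so this only runs on older pandas:

pwt = load_pwt_data(version=80)
usa_rgdp = pwt['rgdpna']['USA']  # 'rgdpna' is a PWT 8.0 variable name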
def load_dta(filename, chunksize=5000, **kwargs):
    # read the .dta file chunk by chunk and concatenate at the end
    reader = pd.read_stata(filename, iterator=True, **kwargs)
    chunks = []
    rows = 0

    try:
        chunk = reader.get_chunk(chunksize)

        while len(chunk) > 0:
            chunks.append(chunk)
            rows += len(chunk)

            print('Loaded {} rows...'.format(rows))
            sys.stdout.flush()

            chunk = reader.get_chunk(chunksize)

        print('Done!')

    except StopIteration:  # the reader signals end-of-file this way
        print('Done!')

    except Exception as e:
        print(type(e).__name__ + ': ' + str(e))

    return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
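For comparison, a sketch of the same chunked load using read_stata's chunksize argument, which makes the reader iterable over DataFrame chunks (the file name is hypothetical):

import pandas as pd

def load_dta_chunked(filename, chunksize=5000, **kwargs):
    # each iteration yields one DataFrame chunk; concatenate them at the end
    with pd.read_stata(filename, chunksize=chunksize, **kwargs) as reader:
        return pd.concat(reader, ignore_index=True)

df = load_dta_chunked('survey.dta')  # hypothetical file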
    def fill_hdf_from_stata(self, table):
        from pandas import read_stata
        assert table in self.tables, "Table {} is not a known table".format(table)
        stata_file = self.tables[table]["stata_file"]
        try:
            variables = self.tables[table]['variables']
        except KeyError:
            variables = None
        if not os.path.isfile(stata_file):
            raise Exception("file_path {} does not exist".format(stata_file))

        log.info("Inserting stata table {} in file {} in HDF file {} at point {}".format(
            table,
            stata_file,
            self.hdf5_file_path,
            table,
            )
        )
        stored_dataframe = read_stata(stata_file)
        store_path = table
        if variables is not None:
            log.info('variables asked by the user: {}'.format(variables))
            variables_stored = list(set(variables).intersection(set(stored_dataframe.columns)))
            log.info('variables stored: {}'.format(variables_stored))
            stored_dataframe[variables_stored].to_hdf(self.hdf5_file_path, store_path, format='table', append=False)
        else:
            try:
                stored_dataframe.to_hdf(self.hdf5_file_path, store_path, format='table', append=False)
            except Exception:
                # fall back to the default fixed format if the 'table' format fails
                stored_dataframe.to_hdf(self.hdf5_file_path, store_path, append=False)
        gc.collect()
def getPats():
    """
    Loads NBER patent data, 1976-2006, and returns two data frames. One df with biotech patents, defined
    by the icl and icl_class, and one df with ai patents, defined by nclass.
    """
    # Import patent data
    orig7606 = pd.read_stata(nberPath+'/orig_gen_76_06.dta')
    pat7606 = pd.read_csv(nberPath+'/pat76_06_ipc.csv', index_col=0)  # DataFrame.from_csv was removed from pandas

    # pat76-06 has one record per patent-assignee pair, so drop duplicates to keep one record per patent
    pat7606 = pat7606.drop_duplicates(subset=['patent'])
    pat7606 = pat7606.drop(['uspto_assignee','year'],axis=1)

    # Put originality, generality, and citation measures onto the pat76-06 dataframe
    pat7606 = pd.merge(pat7606, orig7606, on=['patent'], how='left')

    # Filter down to only bio-pats using hard-coded classifications from (REFERENCE)
    bioICL = pd.DataFrame(['A01H  100','A01H  400','A61K 3800','A61K 3900','A61K 4800','C02F  334','C07G 1100','C07G 1300','C07G 1500','C07K  400','C07K 1400','C07K 1600','C07K 1700','C07K 1900','G01N 27327','G01N 3353','G01N 33531','G01N 33532','G01N 33533','G01N 33534','G01N 33535','G01N 33536','G01N 33537','G01N 33538','G01N 33539','G01N 3354','G01N 33541','G01N 33542','G01N 33543','G01N 33544','G01N 33545','G01N 33546','G01N 33547','G01N 33548','G01N 33549','G01N 3355','G01N 33551','G01N 33552','G01N 33553','G01N 33554','G01N 33555','G01N 33556','G01N 33557','G01N 33558','G01N 33559','G01N 3357','G01N 33571','G01N 33572','G01N 33573','G01N 33574','G01N 33575','G01N 33576','G01N 33577','G01N 33578','G01N 33579','G01N 3368','G01N 3374','G01N 3376','G01N 3378','G01N 3388','G01N 3392'], columns=['icl'])
    bioICLclass = pd.DataFrame(['C12M','C12N','C12P','C12Q','C12S'],columns=['icl_class'])
    bioPats1 = pat7606.merge(bioICL,on=['icl'])
    bioPats2 = pat7606.merge(bioICLclass,on=['icl_class'])
    bioPats = pd.concat([bioPats1,bioPats2])
    
    # Filter down to ai pats
    aiPats = pat7606[pat7606.nclass==706]
    
    return bioPats, aiPats
Example #9
def main():
	print "Enter a filename"
	filename = raw_input()
	if ".dta" in filename:
		df = pd.read_stata(os.path.join(RAW_DATA, filename))
		# I want to treat these as string values rather than actually missing
	elif ".csv" in filename:
		df = pd.read_csv(os.path.join(RAW_DATA, filename))
		df = df.fillna("missing")  # assign back: fillna is not in-place by default
	else:
		sys.exit("File type not detected or supported.  .csv or .dta accepted")
	df.columns = [x.lower() for x in df.columns]
	deviceList = []
	for i in range(len(df.index)):
		observation = df.loc[i]
		os_type = determine_os_type(observation)
		df.loc[i,'devicetype'] = os_type
	#print df.at[0, 'responseid']
	for x in sorted(get_column_levels(df, COLUMNS_DICT['useragent'])):
		print "Warning: uknown device type for entries labeled   {}".format(x)
	# To save space I'm only going to write an id variable and the newly created devicetype
	try:
		df = df[['responseid', 'devicetype','bmi_useragent']]
	except KeyError:
		df = df[['id', 'devicetype', 'bmi_useragent' ]]
	df.to_stata(os.path.join(OUT_DIR,'{}_devices.dta'.format(filename.replace(".dta","").replace(".csv",""))), write_index=False)
Example #10
def dta2dbf(dta_path,dbf_path):
    """
    Wrapper function to convert a stata dta file into a dbf file. 

    __author__  = "Luc Anselin <*****@*****.**> "
    
    Uses df2dbf to write the dbf files from a pandas
    DataFrame. Uses all default settings for df2dbf (see docs
    for specifics).
    ...

    Arguments
    ---------

    dta_path    : str
                  Path to the Stata dta file
    dbf_path    : str
                  Path to the output dbf file
                  
    Returns
    -------
    dbf_path    : path to output file
    """
    
    db = pd.read_stata(dta_path)
    dp = df2dbf(db, dbf_path)
    return dp
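Usage is then a single call (the file names here are hypothetical):

dbf_file = dta2dbf('columbus.dta', 'columbus.dbf')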
Example #11
 def __init__(self, filename):
     if filename.endswith('csv'):
         pd.DataFrame.__init__(self, data=pd.read_csv(filename))
     elif filename.endswith('tab'):
         pd.DataFrame.__init__(self, data=pd.read_csv(filename, sep='\t'))
     elif filename.endswith('dta'):
         pd.DataFrame.__init__(self, data=pd.read_stata(filename))
def dta_pandas(fp):
    """Read and convert a dta file to JSON format using the pandas library
    :param fp: File pointer object
    :return: tuple of table headers and data
    """
    dataframe = pandas.read_stata(fp)
    return data_from_dataframe(dataframe)
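data_from_dataframe is not shown in this snippet; a minimal hypothetical stand-in consistent with the documented return value (table headers plus row data) could be:

def data_from_dataframe(dataframe):
    # hypothetical helper: headers as a list, rows as JSON-friendly records
    headers = list(dataframe.columns)
    rows = dataframe.to_dict(orient='records')
    return headers, rows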
Example #13
	def loadSTATA(self):
		try:
			df = pd.read_stata(str(self.filename))
		except Exception:
			print("unexpected error occurred")
			return
		print(df.head())

		l = list(df.columns)
		head = self.tableWidget_3.horizontalHeader()
		head.setStretchLastSection(True)
		nrow = min(len(df.index), 100)  # show at most 100 rows

		#self.datatable = QtGui.QTableWidget(parent=self)
		self.tableWidget_3.setColumnCount(len(df.columns))
		self.tableWidget_3.setRowCount(nrow)
		for i in range(nrow):
			for j in range(len(df.columns)):
				self.tableWidget_3.setItem(i, j, QtGui.QTableWidgetItem(str(df.iat[i, j])))  # iget_value was removed from pandas
		self.tableWidget_3.setHorizontalHeaderLabels(l)

		self.headerName = l
		self.nonSelectedVariables = self.headerName
		self.data = df
		st = str(nrow) + " of " + str(len(df.index)) + " rows have been shown"
		self.label.setText(st)
		self.label.setVisible(True)
		self.initDict()
		self.initComboBox()
Example #14
def webuse(data, baseurl='https://www.stata-press.com/data/r11/', as_df=True):
    """
    Download and return an example dataset from Stata.

    Parameters
    ----------
    data : str
        Name of dataset to fetch.
    baseurl : str
        The base URL to the stata datasets.
    as_df : bool
        Deprecated. Always returns a DataFrame

    Returns
    -------
    dta : DataFrame
        A DataFrame containing the Stata dataset.

    Examples
    --------
    >>> dta = webuse('auto')

    Notes
    -----
    Make sure baseurl has trailing forward slash. Doesn't do any
    error checking in response URLs.
    """
    url = urljoin(baseurl, data+'.dta')
    return read_stata(url)
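Per the docstring, usage is one call (this downloads from stata-press.com, so it needs network access):

dta = webuse('auto')
print(dta[['make', 'price', 'mpg']].head())  # 'auto' is Stata's standard example dataset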
Example #15
 def _open_stata_file(self, path):
     stata_file = pd.read_stata(
         path,
         convert_categoricals=False,
         order_categoricals=False,
         iterator=True,
     )
     return stata_file
Example #16
def output():
  files = glob.glob("".join((gen_input, "*.dta")))
  for file in files:
    stata_file = pandas.read_stata(file, encoding='latin1')
    name = basename(file)
    stata_file = stata_file.drop("level_0", axis=1)
    name = name.replace(".dta", "gen.dta")
    stata_file.to_stata("".join((gen_output, name)))
Example #17
def get_pwt_data(version=80, extract=True):
    """
    Load the Penn World Tables data. If no local copy of the data is found, then
    a copy will be downloaded from the web.

    Arguments:
 
        version: (int) Version number for PWT data. Default is 80 (which is the 
                 most recent version).
    
        extract: (boolean) Whether or not you wish to save local copy of the 
                 PWT data in your working directory. Default is True.

    Returns:

        pwt:    A Pandas Panel object containing the Penn World Tables data.

    TODO: Convert time index to datetime object.

    """        
    # first check for a local copy of PWT
    try: 
        path = 'pwt' + str(version) + '.dta'
        pwt = pd.read_stata(path)

    # otherwise, download the appropriate zip file 
    except IOError:  
        url = ('http://www.rug.nl/research/GGDC/data/pwt/V' + str(version) +
               '/pwt' + str(version) + '.zip') 
        archive = zipfile.ZipFile(StringIO(urlopen(url).read()), 'r') 

        # to extract or not to extract...
        tmp_file = 'pwt' + str(version) + '.dta'
        
        if extract:
            archive.extractall()
            pwt = pd.read_stata(tmp_file)
        else:
            pwt = archive.read(tmp_file)
            pwt = pd.read_stata(StringIO(pwt))         
    
    # convert to Pandas Panel object
    pwt.index = [pwt['year'], pwt['countrycode']]
    pwt = pwt.to_panel()
    
    return pwt
def nber_convert_dta_to_csv(source_dir, target_dir):
	for year in range(1962, 2000 + 1):
		fn = source_dir + "wtf%s.dta" % str(year)[2:]
		print("Loading Year: %s from file: %s" % (year, fn))
		data = pd.read_stata(fn)
		fn = target_dir + "wtf%s.csv" % str(year)[2:]
		print("Converting Year: %s from file: %s" % (year, fn))
		data.to_csv(fn, index=False, quoting=csv.QUOTE_NONNUMERIC)
	print("Convert DTA to CSV Finished!")  # moved out of the loop so it prints once
Example #19
def read_stata(stata_name):
    logger.info("read \"" + stata_name + "\"")
    data = pd.read_stata(
        stata_name,
        iterator=True,
        convert_categoricals=False,
        )
    d, m = parse_dataset(data, stata_name)
    return d, m
Example #20
def batch_process(func, prefix):
    for filename in filenames[first:last]:
        basename = os.path.splitext(os.path.basename(filename))[0]
        outname = tmp + "/" + prefix + basename + '.csv'

        if os.path.isfile(outname):
            continue

        func(pd.read_stata(filename)).to_csv(outname, header=True)
Example #21
 def _load_data_stata(self, symbol, file_path):
     """ Load .dta data files """
     # read_stata takes no header/index_col/parse_dates arguments (those are
     # read_csv options), so set the date index after loading instead
     df = pd.read_stata(file_path)
     self.symbol_data[symbol] = df.set_index(df.columns[0]).sort_index()
Example #22
def output():
  files = glob.glob("".join((cleaning_input, "*.dta")))
  for file in files:
    stata_file = pandas.read_stata(file, encoding='latin1')
    name = basename(file)
    print(name)
    stata_file = stata_file.drop("level_0", axis=1)
    name = re.sub("-raw", "", name)
    stata_file.to_stata("".join((cleaning_output, name)))
def load_trade4digit_municipality():
    exports = pd.read_stata(prefix_path("Trade/exp_rpy_r5_p4.dta"))
    exports = exports.rename(columns={"X_rpy_d": "export_value",
                                      "NP_rpy": "export_num_plants"})
    imports = pd.read_stata(prefix_path("Trade/imp_rpy_r5_p4.dta"))
    imports = imports.rename(columns={"X_rpy_d": "import_value",
                                      "NP_rpy": "import_num_plants"})

    descriptives = exports.merge(imports, on=["yr", "r", "p"], how="outer")
    descriptives = descriptives.fillna({
        "export_value": 0,
        "export_num_plants": 0,
        "import_value": 0,
        "import_num_plants": 0,
    })

    descriptives = descriptives[descriptives.yr.between(YEAR_MIN_TRADE, YEAR_MAX_TRADE)]
    return descriptives
 def test_export_data_G(self):
     #-pyeconlab-#
     self.obj.construct_sitc_dataset(data_type='export', dataset="G", product_level=3, sitc_revision=2, report=False, verbose=False)
     #-stata-#
     self.G = pd.read_stata(TEST_DATA_DIR + "nberwtf_stata_export_sitcr2l3_1962to2000_G.dta")
     self.G.sort_values(['year', 'eiso3c', 'sitc3'], inplace=True)  # DataFrame.sort was removed from pandas
     self.G.reset_index(drop=True, inplace=True)
     assert_allclose(self.obj.dataset['value'].values, self.G['value'].values)
     del self.G
 def test_bilateral_data_F(self):
     #-pyeconlab-#
     self.obj.construct_sitc_dataset(data_type='trade', dataset="F", product_level=3, sitc_revision=2, report=False, verbose=False)
     #-stata-#
     self.F = pd.read_stata(TEST_DATA_DIR + "nberwtf_stata_trade_sitcr2l3_1962to2000_F.dta")
     self.F = self.F.sort_values(['year', 'eiso3c', 'iiso3c', 'sitc3'])  # DataFrame.sort was removed from pandas
     self.F = self.F.reset_index(drop=True)
     assert_allclose(self.obj.dataset['value'].values, self.F['value'].values)
     del self.F
 def test_bilateral_data_A(self):
     # -pyeconlab-#
     self.obj.construct_dataset_SC_CNTRY_SR2L3_Y62to00_A(data_type="trade")
     # -stata-#
     self.A = pd.read_stata(TEST_DATA_DIR + "A-nberfeenstrawtf_do_stata_basic_country_sitc3_bilateral.dta")
     self.A.sort_values(["year", "eiso3c", "iiso3c", "sitc3"], inplace=True)  # DataFrame.sort was removed from pandas
     self.A.reset_index(drop=True, inplace=True)
     assert_allclose(self.obj.dataset["value"].values, self.A["value"].values)
     del self.A
Example #27
 def setup_class(cls):
     """Stata reg output from `sysuse auto; reg price mpg`"""
     cls.init(cls)
     test_path = path.split(path.relpath(__file__))[0]
     auto_path = path.join(test_path, 'data', 'auto.dta')
     autodata = pd.read_stata(auto_path)
     y = 'price'
     x = ['mpg', 'length']
     cls.result = reg(autodata, y, x, cluster='gear_ratio', addcons=True)
     cls.expected = ols_cluster
 def test_import_data_A(self):
     #-pyeconlab-#
     self.obj.construct_sitc_dataset(data_type='import', dataset="A", product_level=3, sitc_revision=2, report=False, verbose=False)
     #-stata-#
     self.A = pd.read_stata(TEST_DATA_DIR + "bacihs96_stata_import_sitcr2l3_1998to2012_A.dta")
     self.A.sort_values(['year', 'iiso3c', 'sitc3'], inplace=True)  # DataFrame.sort was removed from pandas
     self.A.reset_index(drop=True, inplace=True)
     assert_allclose(self.obj.dataset['value'].values, self.A['value'].values)
     del self.A
Example #29
def get_labor_data():

    # read in "raw" CPS data to calculate moments
    # these data were actually cleaned in hours_data_cps_setup.do
    fileDir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(fileDir, '../Data/Current_Population_Survey/cps_est_ability_hours_1992to2013.dta')
    filename = os.path.abspath(os.path.realpath(filename))
    cps = pd.read_stata(filename, columns=['year', 'age', 'hours',
                                           'hours_unit', 'wtsupp'])
    return cps
 def test_import_data_D(self):  # a to_import would be more efficient here
     # -pyeconlab-#
     self.obj.construct_dataset_SC_CNTRY_SR2L3_Y62to00_D(data_type="import")
     # -stata-#
     self.D = pd.read_stata(TEST_DATA_DIR + "D-nberfeenstrawtf_do_stata_basic_country_sitc3_imports.dta")
     self.D.sort_values(["year", "iiso3c", "sitc3"], inplace=True)  # DataFrame.sort was removed from pandas
     self.D.reset_index(drop=True, inplace=True)
     assert_allclose(self.obj.dataset["value"].values, self.D["value"].values)
     del self.D
Example #31
respgender.columns = ['prop']

respgender = respgender.rename(columns={'level_1': 'category'})

# make educ with cats in col
educcats = [c for c in descr.index if 'educ_cat' in c]
respeduc = descr[descr.index.isin(educcats)][['group', 'mean']]

# rows to cols pivot
#
respeduc = pd.pivot_table(respeduc, index=['group', 'variable']).reset_index()

##agecategories with weighted stats
# load in clean data

clean = pd.read_stata(clean / "PAPAB Impact study - Ready for analysis.dta")

#
ages_df = clean.loc[:, [
    'age', 'pip_generation_clean', 'weight_generation', 'weight_generation_inv'
]]
print(ages_df.age.isna().sum())  # no missing values
bins = [0, 24, 34, 44, 54, 64, np.inf]
names = ['<24', '25-34', '35-44', '45-54', '55-64', '65+']
ages_df['agerange'] = pd.cut(ages_df['age'], bins, labels=names)
ages_dum = pd.get_dummies(ages_df['agerange'])

ages_df = pd.concat([ages_df, ages_dum], axis=1, sort=False)

#now aggregate averages across cats + a weight for all pip
Example #32
import pandas as pd
import difflib

asha_df = pd.read_stata('E:/JSY MIS data/ASHA/data_asha_nrhm_without15.dta')
dhs_df = pd.read_stata('E:/JSY MIS data/ASHA/st_dist_dhs_2.dta')

print(len(asha_df))


def fix_spelling(x, source):
    try:
        return difflib.get_close_matches(x, source)[0]
    except Exception:
        return "NA"


asha_df['s_nrhm'] = asha_df['s_nrhm'].apply(
    lambda x: fix_spelling(x, dhs_df['s_nrhm']))

asha_df['sdist'] = asha_df['sdist'].apply(
    lambda x: fix_spelling(x, dhs_df['sdist']))

merged_df = asha_df.merge(dhs_df, 'inner', on=['s_nrhm', 'sdist'])

merged_df.to_stata('E:/JSY MIS data/ASHA/asha_dhs_merge_state_dist.dta')

print(merged_df.head(25))

print(len(merged_df))
def prepare_data_figure12(data_cwhsa, data_cwhsb):
    """
    Take CWHS data set for FICA earnings and prepare it such that it can be used for
    plotting Figure 1 and 2.

    Parameters
    ----------
    data_cwhsa : pd.DataFrame
        CWHS data until 1978.
    data_cwhsb : pd.DataFrame
        CWHS data from 1978 on.

    Returns
    -------
    data_temp : pd.DataFrame
        Contains the FICA earnings in 1978 dollar terms across those eligible for
        the draft lottery for both whites and nonwhites for different years and birth cohorts.
    difference : pd.DataFrame
        Contains the difference in FICA earnings in 1978 dollar terms between eligible
        and ineligible for different groups (by ethnicity, year and birth cohort).

    """
    # read data for years 64 to 77
    data = data_cwhsa
    # declare that it is FICA earnings
    data["type"] = "TAXAB"

    # read data for the years after 77
    temp_data = data_cwhsb
    data = pd.concat([data, temp_data])  # DataFrame.append was removed in pandas 2.0

    data = data.loc[(data["year"] > 65) & (data["byr"] >= 50)]

    # create eligibility dummy
    data["eligible"] = 0
    data.loc[((data["byr"] >= 44) & (data["byr"] <= 50) &
              (data["interval"] <= 39))
             | ((data["byr"] == 51) & (data["interval"] <= 25))
             | (((data["byr"] == 52) | (data["byr"] == 53)) &
                (data["interval"] <= 19)), "eligible", ] = 1

    # add the cpi to the data
    data_cpi = pd.read_stata("data/cpi_angrist1990.dta")
    data = pd.merge(data, data_cpi, on="year")

    # keep only FICA earnings
    data = data.loc[data["type"] == "TAXAB"]

    # create the average earnings for those with nonzero earnings
    data["earnings"] = data["vmn1"] / (1 - data["vfin1"])
    # create the sample size for those with nonzero earnings
    data["weights"] = data["vnu1"] * (1 - data["vfin1"])

    # create real earnings in 1978 terms
    data["cpi"] = (data["cpi"] /
                   data.loc[data["year"] == 78, "cpi"].mean()).round(3)
    data["real_earnings"] = data["earnings"] / data["cpi"]

    # adjust earnings like in description below Figure 1 in the paper
    for cohort, addition in [(50, 3000), (51, 2000), (52, 1000)]:
        data.loc[data["byr"] == cohort, "real_earnings"] = (
            data.loc[data["byr"] == cohort, "real_earnings"] + addition)

    table = data.fillna(0)

    # drop groups where the weight sums to zero (i.e. where there are no positive earnings)
    sum_group_weights = (table.groupby(["race", "byr", "year", "eligible"
                                        ])["weights"].sum().to_frame())
    nonzero_weights_index = sum_group_weights.loc[
        sum_group_weights["weights"] != 0].index
    table.set_index(["race", "byr", "year", "eligible"], inplace=True)
    table = table.loc[nonzero_weights_index]
    # get weighted averages within by groups of ethnicity, cohort, year and draft eligibility
    data_temp = table.groupby(
        ["race", "byr", "year", "eligible"]).apply(lambda x: np.average(
            x[["real_earnings"]], weights=x["weights"], axis=0))
    data_temp = pd.DataFrame(data_temp.to_list(),
                             columns=["real_earnings"],
                             index=data_temp.index)

    # create dataframe with the differences in real earnings
    # for the above groups across eligibility
    difference = pd.DataFrame(index=data_temp.index, columns=["difference"])
    difference = difference.loc[(slice(None), slice(None), slice(None), 0), :]
    difference.reset_index("eligible", drop=True, inplace=True)
    difference["difference"] = (
        data_temp.loc[(slice(None), slice(None), slice(None), 1), :].values -
        data_temp.loc[(slice(None), slice(None), slice(None), 0), :].values)

    difference.reset_index("year", inplace=True)
    data_temp.reset_index("year", inplace=True)

    return data_temp, difference
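A usage sketch, assuming the two CWHS extracts are available as Stata files; the file names below are hypothetical (only data/cpi_angrist1990.dta is named inside the function):

import pandas as pd

data_temp, difference = prepare_data_figure12(
    pd.read_stata("data/cwhsa.dta"),  # hypothetical file name
    pd.read_stata("data/cwhsb.dta"),  # hypothetical file name
)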
Example #34
    s_last = str(last) if last >= 10 else '0{}'.format(last)
    if first != -1 and first != last and last > round(first + delta, 3):
        p.append('{}{}-{}{}'.format(letter, s_first, letter, s_last))
    elif first != -1 and first != last and last == round(first + delta, 3):
        p.append(letter + s_first)
        p.append(letter + s_last)
    elif first != -1:
        p.append(letter + s_first)
    return (p)


## #########################
## Run Program
## ##########################
## Determine uid_list. Set uid_list to the uid argument if it is an integer
oc = pd.read_stata(data_folder + '/unique_ids_and_codes.dta',
                   encoding='latin-1')
uid_list = [int(uid)] if '{}'.format(uid).isdigit() else set(
    oc['uniqid'].values.astype(int))

# Import data
input_dataset = pd.read_stata(data_folder + '/recalculation_input.dta',
                              encoding='latin-1')

## For each uniqid found in code_breakdowns.dta...
for uid in uid_list:
    print('\n uid: {}'.format(uid))

    ## add data for the given uid
    dt = input_dataset[input_dataset['uniqid'] == uid]
    colnames = [col for col in dt.columns if 'metric' in col]
    colnames.append('cause')
# In[3]:


import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from linearmodels.iv import IV2SLS


# In[4]:


df1 = pd.read_stata('https://github.com/QuantEcon/lecture-python/blob/master/source/_static/lecture_specific/ols/maketable1.dta?raw=true')
df1.head()


# In[5]:


plt.style.use('seaborn')

df1.plot(x='avexpr', y='logpgp95', kind='scatter')
plt.show()


# In[6]:

Example #36
    print("Null Values:")
    print(dataf.apply(lambda x: sum(x.isnull()) / len(dataf)))
##############################################################################
input_path = "/Users/christianhilscher/Desktop/dynsim/input/"
model_path = "/Users/christianhilscher/desktop/dynsim/src/estimation/models/04/"
output_path = "/Users/christianhilscher/Desktop/dynsim/output/"
estimation_path = "/Users/christianhilscher/desktop/dynsim/src/estimation/"
sim_path = "/Users/christianhilscher/desktop/dynsim/src/sim/"

os.chdir("/Users/christianhilscher/desktop/dynsim/src/data_preparation/")
from cleaning import SOEP_to_df
from data_prep import SOEP_to_df_old


# Making new dataset from original SOEP
df_pgen = pd.read_stata("/Volumes/B/soep.v35/STATA_DEEN_v35/Stata/pgen.dta")
df_hgen = pd.read_stata("/Volumes/B/soep.v35/STATA_DEEN_v35/Stata/hgen.dta")
df_ppathl = pd.read_stata("/Volumes/B/soep.v35/STATA_DEEN_v35/Stata/ppathl.dta")
df_hpathl = pd.read_stata("/Volumes/B/soep.v35/STATA_DEEN_v35/Stata/hpathl.dta")
df_hbrutto = pd.read_stata("/Volumes/B/soep.v35/STATA_DEEN_v35/Stata/hbrutto.dta")

df_pgen = df_pgen[["hid", "pid" , "syear", "pglabgro", "pgemplst", "pglfs", "pgtatzeit", "pgerwzeit", "pgpsbil", "pgfamstd"]]
df_hgen = df_hgen[["hid", "syear", "hgheat", "hgrent", "hgtyp1hh"]]


df_ppathl = df_ppathl[["hid", "pid", "syear", "sex", "gebjahr", "migback", "phrf"]]
df_hpathl = df_hpathl[["hid", "syear", "hhrf"]]
df_hbrutto = df_hbrutto[["hid", "syear", "bula"]]

# Merging datasets from SOEP
person_df = pd.merge(df_pgen, df_ppathl, on=["pid", "syear"], how="left")
    dq_utmCA = r"{}/output/dq_utmCA.dta".format(buildPath)
    # Property selection should only have one variable: sr_property_id
    dq_selection = ""

    # Temp files
    temp_select = "temp_select.shp"
    unified = "temp_unified.shp"
    dist_temp = "temp_top100_lines.csv"

    # California Transverse Mercator Projection
    # (http://spatialreference.org/ref/sr-org/7098/)
    # Also known as UTM Zone 10.5
    ctm = ctm_proj()

    # Now merge reprojected house locations with selection of houses
    dfproj = pd.read_stata(dq_utmCA)
    if dq_selection != "":
        dfselect = pd.read_stata(dq_selection)
        df = pd.merge(left=dfproj,right=dfselect, how='right',
            left_on='sr_property_id', right_on='sr_property_id')
    else:
        df = dfproj

    print('df read')

    # Reproject top 100
    reproject(top100_lines_raw,lines_utmCA,ctm)

    # Caluclate distances
    with fiona.open(lines_utmCA) as source:
        dataout_lines = dq_to_feats(df,source)
Example #38
# In[2]:

import pandas as pd

# ## Load the data.
#
# ---
#
# If this fails, be sure that you've saved your own data in the prescribed location, then retry.

# In[3]:

file = "../data/interim/1976data.dta"
# Matt Wilson converted the older Stata file to the one we use.
df_rawest = pd.read_stata(file)

# In[4]:

good_columns = [
    # Demographic
    'V763174',  # SUMMARY-R'S PARTY ID
    'V763796',  # OPIN:WHEN ALLOW ABORTION (1: never)
    'V763273',  # Private vs public insurance
    'V763241',  # GOVT GUAR JOB/S.L  (1: guarantee)
    'V763353',  # Gov't should spend less, even if cutting health and education.
    'V763264',  # MNRTY GRP AID SCL (1: help)
    #    'V763757',  # THE POOR ARE POOR BECAUSE THE AMERICAN WAY OF LIFE DOESN'T GIVE ALL PEOPLE AN EQUAL CHANCE? (1: agree)
]
df_raw = df_rawest[good_columns]
def lines_cases_per_million_not_log():

    # chart_width = 550
    chart_height = 400

    # hide
    data = pd.read_csv("static/csv/time_series_covid19_confirmed_global.csv", error_bad_lines=False)
    data = data.drop(columns=["Lat", "Long"])
    data = data.melt(id_vars=["Province/State", "Country/Region"])
    data = pd.DataFrame(data.groupby(['Country/Region', "variable"]).sum())
    data.reset_index(inplace=True)
    data = data.rename(columns={"Country/Region": "location", "variable": "date", "value": "total_cases"})
    data['date'] = pd.to_datetime(data.date)
    data = data.sort_values(by="date")
    data.loc[data.location == "US", "location"] = "United States"
    data.loc[data.location == "Korea, South", "location"] = "South Korea"

    data_pwt = pd.read_stata("https://www.rug.nl/ggdc/docs/pwt91.dta")

    filter1 = data_pwt["year"] == 2017
    data_pop = data_pwt[filter1]
    data_pop = data_pop[["country", "pop"]]
    data_pop.loc[data_pop.country == "Republic of Korea", "country"] = "South Korea"
    data_pop.loc[data_pop.country == "Iran (Islamic Republic of)", "country"] = "Iran"

    # per habitant
    data_pc = data.copy()
    countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany",
                 "Portugal", "United States", "Singapore", "South Korea", "Japan",
                 "Brazil", "Iran", 'Netherlands', 'Belgium', 'Sweden',
                 'Switzerland', 'Norway', 'Denmark', 'Austria', 'Slovenia', 'Greece',
                 'Cyprus']
    data_countries = []
    data_countries_pc = []

    MIN_DEATHS = 10
    filter_min_dead = data_pc.total_cases < MIN_DEATHS
    data_pc = data_pc.drop(data_pc[filter_min_dead].index)

    # compute per habitant
    for i in countries:
        data_pc.loc[data_pc.location == i, "total_cases"] = data_pc.loc[data_pc.location == i, "total_cases"] / float(
            data_pop.loc[data_pop.country == i, "pop"])

        # get each country time series
    filter1 = data_pc["total_cases"] > 1
    for i in countries:
        filter_country = data_pc["location"] == i
        data_countries_pc.append(data_pc[filter_country & filter1])

    # hide_input
    # Stack data to get it to Altair dataframe format
    data_countries_pc2 = data_countries_pc.copy()
    for i in range(0, len(countries)):
        data_countries_pc2[i] = data_countries_pc2[i].reset_index()
        data_countries_pc2[i]['n_days'] = data_countries_pc2[i].index
        data_countries_pc2[i]['log_cases'] = np.log10(data_countries_pc2[i]["total_cases"])
    data_plot = data_countries_pc2[0]
    for i in range(1, len(countries)):
        data_plot = pd.concat([data_plot, data_countries_pc2[i]], axis=0)
    data_plot["trend_4days"] = np.log10(2) / 4 * data_plot["n_days"]
    data_plot["trend_12days"] = np.log10(2) / 12 * data_plot["n_days"]
    data_plot["trend_4days_label"] = "Doubles evey 4 days"
    data_plot["trend_12days_label"] = "Doubles every 12 days"

    # Plot it using Plotly Express
    source = data_plot

    fig = px.line(data_plot, x="n_days", y="total_cases", color='location')

    fig.update_layout(xaxis_title='Day Since first infection',
                      yaxis_title='#cases/million')
    return fig
def ScrapeIslands(self, backUpFile='backUpFile.csv'):
    inputFile = self.infile

    if inputFile[-3:] == 'csv':
        remainingPids = pd.read_csv(inputFile).filter(['pid'])
    elif inputFile[-3:] == 'dta':
        remainingPids = pd.read_stata(inputFile).filter(['pid'])
    elif inputFile[-3:] == 'lsx':
        remainingPids = pd.read_excel(inputFile).filter(['pid'])

    t0 = time.time()
    islandID = 0

    total = remainingPids.count()[0]
    finalcsv = pd.DataFrame()
    finished = False

    print('Starting scrape')
    while (not (finished)):
        pid = remainingPids.iloc[0, remainingPids.columns.get_loc('pid')]
        #pid is inserted 3 times because scrapeperson often breaks if only fed one pid
        temp = pd.DataFrame(data={'pid': [pid, pid, pid]})
        size = temp.drop_duplicates().count()[0]
        completeIsland = temp.drop_duplicates()
        growing = True
        try:
            while (growing):
                temp.to_csv(r'in.csv')
                finaldf = pd.DataFrame()

                with suppress_stdout():
                    try:
                        self.ScrapePerson()
                    except:
                        print("Invalid pid " + pid)
                outFile = pd.read_csv('out.csv',
                                      usecols=[
                                          'FSID', 'spouse_FSID', 'kid_FSIDs',
                                          'dad_FSIDs', 'mom_FSIDs'
                                      ])

                #only used on the first pass, gets rid of extra pids needed to run scrapeperson
                if size == 1:
                    outFile = outFile.drop_duplicates()

                outFile = outFile.rename(
                    columns={
                        'FSID': 'pid',
                        'spouse_FSID': 'spouse',
                        'kid_FSIDs': 'kids',
                        'dad_FSIDs': 'dad',
                        'mom_FSIDs': 'mom'
                    })
                #do by column, rename pid, merge and drop duplicates
                counts = outFile.count()

                if counts.get('spouse') > 0:
                    pidSpouse = \
                    (outFile.set_index(outFile.columns.drop('spouse',1).tolist())
                       .spouse.str.split(';', expand=True)
                       .stack()
                       .reset_index()
                       .rename(columns={0:'spouse'})
                       .loc[:, outFile.columns]
                    )
                    pidSpouse = pidSpouse.filter(['spouse'
                                                  ]).rename(columns={
                                                      'spouse': 'pid'
                                                  }).drop_duplicates()
                    finaldf = finaldf.append(pidSpouse, ignore_index=True)

                if counts.get('kids') > 0:
                    pidKids = \
                    (outFile.set_index(outFile.columns.drop('kids',1).tolist())
                       .kids.str.split(';', expand=True)
                       .stack()
                       .reset_index()
                       .rename(columns={0:'kids'})
                       .loc[:, outFile.columns]
                    )
                    pidKids = pidKids.filter(['kids']).rename(columns={
                        'kids': 'pid'
                    }).drop_duplicates()
                    finaldf = finaldf.append(pidKids, ignore_index=True)

                if counts.get('dad') > 0:
                    pidDad = \
                    (outFile.set_index(outFile.columns.drop('dad',1).tolist())
                       .dad.str.split(';', expand=True)
                       .stack()
                       .reset_index()
                       .rename(columns={0:'dad'})
                       .loc[:, outFile.columns]
                    )
                    pidDad = pidDad.filter(['dad']).rename(columns={
                        'dad': 'pid'
                    }).drop_duplicates()
                    finaldf = finaldf.append(pidDad, ignore_index=True)

                if counts.get('mom') > 0:
                    pidMom = \
                    (outFile.set_index(outFile.columns.drop('mom',1).tolist())
                       .mom.str.split(';', expand=True)
                       .stack()
                       .reset_index()
                       .rename(columns={0:'mom'})
                       .loc[:, outFile.columns]
                    )
                    pidMom = pidMom.filter(['mom']).rename(columns={
                        'mom': 'pid'
                    }).drop_duplicates()
                    finaldf = finaldf.append(pidMom, ignore_index=True)

                #drops any duplicate pids found, will happen more on later generations
                finaldf = finaldf.drop_duplicates()

                #insures that scrapeperson always has more than one pid to run on
                try:
                    if finaldf.count()[0] > 0 and finaldf.count()[0] < 3:
                        finaldf = finaldf.append(finaldf, ignore_index=True)
                        #print(finaldf)
                except:
                    print("Single household")

                #appends pids found to the list
                completeIsland = completeIsland.append(
                    finaldf, ignore_index=True).drop_duplicates()
                totalSize = completeIsland.count()[0]
                if size == totalSize or totalSize > 1500:
                    growing = False
                    completeIsland['islandID'] = islandID
                    completeIsland['Count'] = totalSize
                    print("Number of people found: " + str(totalSize))
                else:
                    size = totalSize
                    temp = finaldf
                    if size > 9:
                        print("Found " + str(size) + " in the island so far")
        #something broke so it saves the pids remaining and exports what it has gathered so far
        except:
            try:
                print(sys.exc_info()[0])
                sys.exc_info()[2].traceback.print_exception(limit=5)
            except:
                pass
            remainingPids.to_csv(backUpFile, index=False)
            break

        finalcsv = finalcsv.append(completeIsland, ignore_index=True)
        remainingPids = remainingPids[~remainingPids['pid'].
                                      isin(completeIsland['pid'])]
        leftover = remainingPids.count()[0]
        print(str(leftover) + ' people left of ' + str(total))

        islandID += 1
        if islandID % 25 == 0:
            print()
            t2 = time.time()
            avg_time = (t2 - t0) / islandID
            print('Avg Time: ' + str(round(avg_time, 3)) + ' Seconds/Island')
            print('Est. time remaining: ' +
                  str(round(avg_time * leftover / 3 / 60, 3)) + ' minutes')

        print()
        if leftover == 0:
            finished = True

    print("Finished!")
    return finalcsv
Example #41
import os
import pandas as pd

import python.datafiles as datafiles

# A .dta file must be read with pandas.read_stata, not read_csv: the earlier
# read_csv attempts (with various encodings) fail on a Stata binary, and the
# truncated read_csv('file1.csv', ...) line was a syntax error.
data = pd.read_stata("data/workdata_2016.dta", encoding="latin-1")
# data_recip_1.to_csv(   folder + "recip-1/"   + name + '.csv')
Example #42
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 18 14:12:49 2017

@author: pradeep
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

cluster_dt = pd.read_stata(
    '/Users/pradeep/Google Drive ([email protected])/Cluster Paper/Data/Data_MAin_v12.dta'
)
cluster_dt.to_pickle('cluster_dt.pkl')
cluster_dt = pd.read_pickle('cluster_dt.pkl')

#pprint(cluster_dt.columns)
#print(cluster_dt.columns)
#print(pd.get_option("display.max_columns"))
#print(pd.set_option("display.max_columns",999))
#pd.set_option('display.max_columns', None)
#cluster_dt.columns
#pandas.set_option('display.max_rows', None)

cols = list(cluster_dt.columns)

df1 = cluster_dt[[
    'Company_Name', 'City', 'PreviousLocation_F1', 'PreviousLocation_F2',
    'PreviousLocation_F3', 'PreviousLocation_F4',
Example #43
# IPPP Final Project - Regression Analysis
### Long-term Health Impact ###
# outcome vars are: sick(dummy for person sick), days sick,

import pandas as pd
import numpy as np
import statsmodels.api as sm

#read investment data
investments = pd.DataFrame(
    pd.read_stata("investments_data.dta", convert_categoricals=False))

#mask wave 7 to get 2003 data and drop obvs with no consumption data
investments = investments.loc[(investments['wave'] == 7)
                              & (investments['consumo_pp_ae'] != 0)]
investments.dropna(subset=['consumo_pp_ae'], inplace=True)

#mask out 1 (poorest hh) ntile and 200 (richest hh) ntile of consumption
investments = investments.loc[(investments['consumo_pp_ae2'] > 22.2066)
                              & (investments['consumo_pp_ae2'] < 1118.53)]

#read morbidity data
morbidity = pd.DataFrame(
    pd.read_stata("adults_morbidity_03.dta", convert_categoricals=False))

# create age squared
morbidity['age_sq'] = morbidity['age']**2

#sorting the columns for joining
morbidity = morbidity.sort_values(by=['folio', 'state', 'muni', 'local', 'wave'],
                                  ascending=[True, True, True, True, True])
def gen_var(agegroup_type_Bailey = True):
    """Function to create the relevant categorials from scratch.
    
        Args:
        -------
        
        Returns:
        -------
        A data frame containing the variables needed for the analysis.
    """
    wn.filterwarnings("ignore")
    #1. 1955 GAF
    #Read dta file 1955 GAF
    
    df_55 = pd.read_stata('data/gaf55_1.dta', convert_categoricals = False)
    
    #generate dummy: 1 if fertile and 0 if not
    df_55['_fecund'] = 0
    df_55.loc[ (df_55['fec_code'] == 4) | (df_55['fec_code'] == 5), '_fecund' ] = 1
    
    #generate dummy if ever used method of contraception
    df_55['_everuse'] = 0
    df_55.loc[ (df_55['meth_everuse'] == 1), '_everuse' ] = 1
    
    #generate dummy if ever used method of contraception or as douche
    df_55['_everuse_d'] = 0
    df_55.loc[(df_55['meth_everuse'] == 1) | (df_55['meth_everuse'] == 2), '_everuse_d'] = 1
    
    #generate dummy if ever used condom or diaphragma
    df_55['_barrier'] = 0
    df_55.loc[ (df_55['con_use'] == 1) | (df_55['dia_use'] == 1), '_barrier'] = 1
    
    #generate dummy whether either condom, diaphragma, helly, vagsupp, foamtab,
    #tampon or any other method was ever used
    df_55['_supplies'] = 0
    df_55.loc[ (df_55['con_use']==1)|(df_55['dia_use']==1)|(df_55['jelly_use']==1)|(df_55['vagsupp_use']==1) \
                    |(df_55['foamtab_use']==1)|(df_55['tampon_use']==1)|(df_55['othmeth_use']==1), '_supplies'] = 1
    
    #generate dummy, if ever used oral contraception (not possible in 1955)
    df_55['_oral'] = 0   
    
    #generate dummy wgt
    df_55['wgt'] = 1 
    
    #generate year
    df_55['year'] = 1955
    
    
    #2. 1965 NFS
        #read .dta file 1965 NFS
    df_65 = pd.read_stata('data/nfs65_1.dta', convert_categoricals = False)
    
        #generate dummy for fertility, one means fertile
    df_65['_fecund'] = 0
    df_65.loc[ (df_65['fecund'] == 8), '_fecund'] = 1
    
        #generate dummy if ever used method of contraception or as douche
    df_65['_everuse_d'] = 0
    df_65.loc[(df_65['preg1_meth1_4'] == 1) | (df_65['preg1_meth1_5'] == 1)\
                       | (df_65['preg1_meth1_6'] == 1) | (df_65['preg1_meth1_7']==1) \
                       | (df_65['preg1_meth1_8']==1)   |(df_65['preg1_meth1_12']==1) | (df_65['preg1_meth1_9']==1) \
                       | (df_65['preg1_meth1_10']==1) | (df_65['preg1_meth1_11']==1) |(df_65['preg1_meth1_13']==1) \
                       | (df_65['preg1_meth1_14']==1)|(df_65['preg1_meth1_15']==1) | (df_65['preg1_meth1_16']==1), '_everuse_d'] = 1
    
        #generate dummy whether a contraception method was used before the first pregnancy
    df_65['_everuse2_mjb'] = 0
    df_65.loc[df_65['preg1_meth1_3'] == 0, '_everuse2_mjb'] = 1 #1 if not used before first pregnancy
    df_65.loc[df_65['preg1_meth1_2'] == 1, '_everuse2_mjb'] = np.NaN # no answer to question of contraception used before first pregnancy
    
        #generate dummy if ever used method of contraception (exclusive douche)
    df_65['_everuse3_mjb'] = df_65['_everuse_d']
    df_65.loc[ (df_65['preg1_meth1_13']==1) &  (df_65['preg1_meth1_1']==0) & (df_65['preg1_meth1_2']==0) \
                           & (df_65['preg1_meth1_3']==0) & (df_65['preg1_meth1_4']==0) & (df_65['preg1_meth1_5']==0) \
                            & (df_65['preg1_meth1_6']==0) & (df_65['preg1_meth1_7']==0) & (df_65['preg1_meth1_8']==0) \
                        & (df_65['preg1_meth1_9']==0) & (df_65['preg1_meth1_10']==0) & (df_65['preg1_meth1_11']==0) \
                        & (df_65['preg1_meth1_12']==0) & (df_65['preg1_meth1_14']==0) & (df_65['preg1_meth1_15']==0) \
                           & (df_65['preg1_meth1_16'] == 0), '_everuse3_mjb' ] = 0
    
        #generate dummy if a barrier method (condom, diaphragma) was ever used
    df_65['_barrier'] = 0
    df_65.loc[ (df_65['preg1_meth1_5'] == 1) | (df_65['preg1_meth1_7'] == 1), '_barrier' ] = 1
    
        #generate dummy if any supply methods were ever used (condoms, diaphragma, pill, jelly, foam, douche, IUD, Sponge )
    df_65['_supplies'] = 0
    df_65.loc[ (df_65['preg1_meth1_5'] == 1) | (df_65['preg1_meth1_7'] == 1)|(df_65['preg1_meth1_8'] == 1) \
                       | (df_65['preg1_meth1_9'] == 1) |  (df_65['preg1_meth1_10'] == 1) | (df_65['preg1_meth1_11']==1) \
                       | (df_65['preg1_meth1_13']==1)   |(df_65['preg1_meth1_14']==1) | (df_65['preg1_meth1_16']==1), '_supplies' ] = 1   
    
        #generate dummy if oral contraception was used (pill)
    df_65['_oral'] = 0
    df_65.loc[ df_65['preg1_meth1_8'] == 1, '_oral' ] = 1
    
    
    #repeat this for any different A-types of pregi_meth1_j
    for i in range(0,21):
        df_65.loc[ (df_65['preg{}_meth1_4'.format(i)] == 1) | (df_65['preg{}_meth1_5'.format(i)] == 1) \
                       | (df_65['preg{}_meth1_6'.format(i)] == 1) | (df_65['preg{}_meth1_7'.format(i)]==1) \
                       | (df_65['preg{}_meth1_8'.format(i)]==1)   |(df_65['preg{}_meth1_12'.format(i)]==1) | (df_65['preg{}_meth1_9'.format(i)]==1) \
                       | (df_65['preg{}_meth1_10'.format(i)]==1) | (df_65['preg{}_meth1_11'.format(i)]==1) |(df_65['preg{}_meth1_13'.format(i)]==1) \
                       | (df_65['preg{}_meth1_14'.format(i)]==1)|(df_65['preg{}_meth1_15'.format(i)]==1) | (df_65['preg{}_meth1_16'.format(i)]==1), '_everuse_d'] = 1
        
        df_65.loc[df_65['preg{}_meth1_3'.format(i)] == 0, '_everuse2_mjb'] = 1    
        
        df_65['_everuse3_mjb'] = df_65 ['_everuse_d']   
        df_65.loc[ (df_65['preg{}_meth1_13'.format(i)]==1) &  (df_65['preg{}_meth1_1'.format(i)]==0) & (df_65['preg{}_meth1_2'.format(i)]==0) \
                       & (df_65['preg{}_meth1_3'.format(i)]==0) & (df_65['preg{}_meth1_4'.format(i)]==0) & (df_65['preg{}_meth1_5'.format(i)]==0) \
                       & (df_65['preg{}_meth1_6'.format(i)]==0) & (df_65['preg{}_meth1_7'.format(i)]==0) & (df_65['preg{}_meth1_8'.format(i)]==0) \
                       & (df_65['preg{}_meth1_9'.format(i)]==0) & (df_65['preg{}_meth1_10'.format(i)]==0) & (df_65['preg{}_meth1_11'.format(i)]==0) \
                       & (df_65['preg{}_meth1_12'.format(i)]==0) & (df_65['preg{}_meth1_14'.format(i)]==0) & (df_65['preg{}_meth1_15'.format(i)]==0) \
                       & (df_65['preg{}_meth1_16'.format(i)] == 0), '_everuse3_mjb' ] = 0   
    
        df_65.loc[ (df_65['preg{}_meth1_5'.format(i)] == 1) | (df_65['preg{}_meth1_7'.format(i)] == 1), '_barrier' ] = 1
    
        df_65.loc[ (df_65['preg{}_meth1_5'.format(i)] == 1) | (df_65['preg{}_meth1_7'.format(i)] == 1)|(df_65['preg{}_meth1_8'.format(i)] == 1) \
                       | (df_65['preg{}_meth1_9'.format(i)] == 1) |  (df_65['preg{}_meth1_10'.format(i)] == 1) | (df_65['preg{}_meth1_11'.format(i)]==1) \
                       | (df_65['preg{}_meth1_13'.format(i)]==1)   |(df_65['preg{}_meth1_14'.format(i)]==1) | (df_65['preg{}_meth1_16'.format(i)]==1), '_supplies' ] = 1 
    
        df_65.loc[ df_65['preg{}_meth1_8'.format(i)] == 1, '_oral' ] = 1
        #print(i)     
    
    
    df_65 = df_65[['_barrier', '_supplies', 'preg0_coitfr', '_oral', '_fecund', '_everuse_d','_everuse2_mjb','_everuse3_mjb', \
                'H_emp_inc65', 'fam_idealch', 'att_idealch', 'res_popdens', 'res_reg', 'statefip', 'statename', 'int_num', 'per_race', \
                '_age', 'ed_higrade', 'rel_pref', 'dob', 'fec_sterilop1', 'wgt', 'ch_totlb']]
    df_65['year'] = 1965
    
    
    #3. 1970 NFS
        #read .dta file 1970 NFS
    df_70 = pd.read_stata('data/nfs70_1.dta', convert_categoricals = False)
    
        #generate dummy if ever used method of contraception or as douche
    df_70['_everuse_d'] = 0
    df_70.loc[ (df_70['preg1_meth_any'] == 1) | (df_70['dia_everuse'] == 1) | (df_70['iud_everuse'] == 1) \
                        | (df_70['_bcp_everuse'] == 1), '_everuse_d'] = 1
    
        #genrate dummy if a barrier method (condom, diaphragma) was ever used
    df_70['_barrier'] = 0
    df_70.loc[ (df_70['preg1_meth_1'] == 3) | (df_70['preg1_meth_1'] == 5) | (df_70['dia_everuse'] == 1), '_barrier'  ] = 1
    
        #generate dummy if any supply methods were ever used (condoms, diaphragma, pill, jelly, foam, douche, IUD, Sponge )
    df_70['_supplies'] = 0
    df_70.loc[ (df_70['preg1_meth_1'] == 3) | (df_70['preg1_meth_1'] == 5) | (df_70['preg1_meth_1'] == 6)   \
                       | (df_70['preg1_meth_1'] == 7) | (df_70['preg1_meth_1'] == 8) | (df_70['preg1_meth_1'] == 9) \
                       | (df_70['preg1_meth_1'] == 10) | (df_70['preg1_meth_1'] == 11) | (df_70['preg1_meth_1'] == 13) \
                       | (df_70['dia_everuse'] == 1) | (df_70['iud_everuse'] == 1) | (df_70['_bcp_everuse'] == 1), '_supplies'] = 1
    
        #generate dummy if oral contraception was used (pill)
    df_70['_oral'] = 0
    df_70.loc[ (df_70['preg1_meth_1'] == 6) | (df_70['_bcp_everuse'] == 1), '_oral' ] = 1
    
        # rename variable to make it suitable for loop
    df_70 = df_70.rename(columns={'preg0_meth': 'preg0_meth_1'})
    #df_70['preg0_meth_1'] = df_70['preg0_meth']
    
    for i in range(0, 20):
        if i == 0:
            # pregnancy 0 carries a single method column (renamed above)
            j = 1
            df_70.loc[df_70['preg0_meth_{}'.format(j)] == 1, '_everuse_d'] = 1
            df_70.loc[df_70['preg0_meth_{}'.format(j)].isin([3, 5]), '_barrier'] = 1
            df_70.loc[df_70['preg0_meth_{}'.format(j)].isin([3, 5, 6, 7, 8, 9, 10, 11, 13]), '_supplies'] = 1
            df_70.loc[df_70['preg0_meth_{}'.format(j)] == 6, '_oral'] = 1
        else:
            df_70.loc[df_70['preg{}_meth_any'.format(i)] == 1, '_everuse_d'] = 1
            for j in range(1, 7):
                df_70.loc[df_70['preg{}_meth_{}'.format(i, j)].isin([3, 5]), '_barrier'] = 1
                df_70.loc[df_70['preg{}_meth_{}'.format(i, j)].isin([3, 5, 6, 7, 8, 9, 10, 11, 13]), '_supplies'] = 1
                df_70.loc[df_70['preg{}_meth_{}'.format(i, j)] == 6, '_oral'] = 1
        #print(i)
    
    df_70 = df_70[['_barrier', '_supplies', '_oral', 'preg0_coitfr', '_everuse_d', 'H_emp_inc70', 'fam_idealch', 'att_idealch', 
                   'statefip', 'statename', 'int_LAN', 'int_LAN_size', 'int_LAN_region', 'int_num', 'per_race', 'wgt', '_age',
                   'mar_stat', 'ed_total', 'rel_pref', 'dob', 'fec_sterilop1', 'ch_totlb' ]]    
         
    df_70['year'] = 1970    
         
    
        #append data frames
    df = df_55
    df = df.append(df_65, ignore_index = True) 
    df = df.append(df_70, ignore_index = True)
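    # Note: DataFrame.append was removed in pandas 2.0; on newer pandas the
    # equivalent concatenation (a sketch, same result) would be:
    # df = pd.concat([df_55, df_65, df_70], ignore_index=True)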
    
    
    #4. Harmonize definitions of sample variables 
        #Race
    df['_White'] = 0
    df.loc[df['year'] == 1955, '_White'] = 1 
    df.loc[(df['year'] == 1965) & (df['per_race'] == 1), '_White'] = 1 
    df.loc[(df['year'] == 1970) & (df['per_race'] == 2), '_White'] = 1
    
    
        #Marital Status
    df['_Married'] = 0
    df.loc[(df['year'] == 1955) | (df['year'] == 1965), '_Married'] = 1
    df.loc[(df['year'] == 1970) & (df['mar_stat'] == 1), '_Married'] = 1
    
    
        #Education
    df['_ed_cat'] = np.NaN 
    lis = [0,9,12,13,16,18] 
    df.loc[(df['year'] == 1965) & (df['ed_higrade'] == 0), '_ed_cat'] = 0
    
    for i in range(1,len(lis)):
        #print(i)
        df.loc[ (df['year'] == 1965) & ( lis[i-1] <= df['ed_higrade'] ) & (df['ed_higrade'] < lis[i]), '_ed_cat' ] = lis[i-1]
        df.loc[ (df['ed_total'] == i), 'ed_total'] = lis[i-1]
    
    df.loc[ (df['year'] == 1970), '_ed_cat'] = df['ed_total']
    df.loc[  (df['ed'] <= 4) & (df['ed'] >= 1), 'ed'] = 0
    df.loc[  (df['ed'] == 8) | (df['ed'] == 9),'ed'] = 16
    df.loc[  (df['ed'] == 5), 'ed'] = 9
    df.loc[  (df['ed'] == 6), 'ed'] = 12
    df.loc[  (df['ed'] == 7), 'ed'] = 13
    df.loc[  (df['year'] == 1955), '_ed_cat'] = df['ed']    
    
        #Catholic
    df['_Catholic'] = 0
    df.loc[ ((df['year'] == 1955) & (df['rel_pref'] == 10)) | ((df['year'] == 1965) & (df['rel_pref'] == 21)) | \
                    ((df['year'] == 1970) & (df['rel_pref'] == 21)), '_Catholic'] = 1
        
        #Birth Cohort
    df['_yob'] = np.NaN    
    lis = range(0,870,12)
    for i in lis:
      df.loc[ ((df['year'] == 1970) | (df['year'] == 1965)) & (df['dob'] >=  i) & (df['dob'] <  i + 12), '_yob'] = i /12 + 1900
        
    df.loc[ ((df['year'] == 1970) | (df['year'] == 1965)) & (df['dob'] >  900), '_yob'] = df['dob'] + 1000   
    df.loc[ (df['year'] == 1955), '_yob' ] = df['dob_y'] + 1900     
    
    df['_yobcat'] = np.NaN  
    lis = range(1910,1960, 5)
    for i in lis:
      df.loc[  (df['_yob'] >=  i) & (df['_yob'] <  i + 5), '_yobcat'] = i  
        
        #age    
    df['_age'] = np.NaN
    df['_age'] = df['year'] - df['_yob']
    df['_agecat'] = np.NaN
    lis = range(15,60,5)    
    for i in lis:
      df.loc[  (df['_age'] >=  i) & (df['_age'] <  i + 5), '_agecat'] = i  
        
       #Surgically sterilized 
    df['_sterilop'] = 0
    df.loc[  (df['year'] == 1955) & (df['fec_ster1'] !=  0) & (df['fec_ster1'] !=  6), '_sterilop'  ] = 1 
    df.loc[  (df['year'] == 1965) & (df['fec_sterilop1'] ==  1), '_sterilop'  ] = 1  
    df.loc[ (df['year'] == 1970) & (df['fec_sterilop1'] ==  1), '_sterilop'  ] = 1  
    
        
       #ideal number of children for american family
    df['_att_idealch'] = np.NaN #standardized 'ideal number of children' variable across surveys
    df.loc[(df['att_idealch'] < 99) & (df['year'] == 1955), '_att_idealch'] = df['att_idealch']/10    
    df.loc[ (df['att_idealch'] > 90) & (df['att_idealch'] <= 130) & (df['year'] == 1965), 'att_idealch' ] = 90 #fix coding bug: collapse miscoded 1965 values (91-130) to 90
    df.loc[ ((df['att_idealch'] <= 90) & (df['year'] == 1965)) \
                       | ((df['att_idealch'] < 96) & (df['year'] == 1970)), '_att_idealch'] = df['att_idealch']/10  
    
    df['_idealcat'] = np.NaN  
    lis = [0,2,3,4,5,10,1000] #1000 is an arbitrarily high upper bound so the loop covers the open-ended top category
    for i in range(0, (len(lis)-1)):
        #print(i)
        df.loc[ (lis[i] <= df['_att_idealch']) & (df['_att_idealch'] < lis[i+1]), '_idealcat'  ] = lis[i]
    
        #Husband's income
    df['h_empinc65'] = np.NaN
    df.loc[ (df['year'] == 1965), 'h_empinc65' ] = df['H_emp_inc65'] 
    df.loc[ (df['h_empinc65'] != 0) & (df['h_empinc65'] < 10), 'h_empinc65' ] = df['h_empinc65']*1000+500 
    df.loc[(df['h_empinc65'] == 10), 'h_empinc65' ] = 11000
    df.loc[(df['h_empinc65'] == 20), 'h_empinc65' ] = 13500
    df.loc[ (df['h_empinc65'] == 30), 'h_empinc65' ] = 21000
    df.loc[(df['h_empinc65'] == 88) | (df['h_empinc65'] == 99), 'h_empinc65' ] = np.NaN
    
    df['h_empinc70'] = np.NaN
    df.loc[(df['year'] == 1970), 'h_empinc70' ] = df['H_emp_inc70'] 
    df.loc[ (df['h_empinc70'] != 0) & (df['h_empinc70'] < 10), 'h_empinc70' ] = df['h_empinc70']*1000+500 
    df.loc[ (df['h_empinc70'] == 10), 'h_empinc70' ] = 12000
    df.loc[ (df['h_empinc70'] == 11), 'h_empinc70' ] = 13500
    df.loc[ (df['h_empinc70'] == 12), 'h_empinc70' ] = 21000
    df.loc[ (df['h_empinc70'] == 88) | (df['h_empinc70'] == 99), 'h_empinc70' ] = np.NaN
    
    df['h_empinc55'] = np.NaN
    df.loc[ (df['year'] == 1955), 'h_empinc55' ] = df['h_emp_inc'] 
    df.loc[ (df['h_empinc55'] != 0) & (df['h_empinc55'] < 9), 'h_empinc55' ] = (df['h_empinc55']-1)*1000+500 
    df.loc[ (df['h_empinc55'] == 9), 'h_empinc55' ] = 9000
    df.loc[ (df['h_empinc55'] == 10), 'h_empinc55' ] = 14000
    df.loc[ (df['h_empinc55'] == 88) | (df['h_empinc55'] == 99), 'h_empinc55' ] = np.NaN
    
    df['_h_empinc'] = np.NaN
    df.loc[(df['year'] == 1955), '_h_empinc'] = (36.7/26.9)*df['h_empinc55'] #in 1969 US dollars
    df.loc[(df['year'] == 1965), '_h_empinc'] = (36.7/31)*df['h_empinc65'] #in 1969 US dollars
    df.loc[(df['year'] == 1970), '_h_empinc'] = df['h_empinc70'] #in 1969 US dollars
    
    df['_hinccat'] = np.NaN
    df.loc[(0 <=df['_h_empinc']) & (df['_h_empinc'] <= 4500), '_hinccat'] = 0
    df.loc[(4500 < df['_h_empinc']) & (df['_h_empinc'] <= 6500), '_hinccat'] = 1
    df.loc[(6500 < df['_h_empinc']) & (df['_h_empinc'] <= 8400), '_hinccat'] = 2
    df.loc[ (8400 < df['_h_empinc']) & (df['_h_empinc'] <= 12000), '_hinccat'] = 3
    df.loc[(12000 < df['_h_empinc']) & (df['_h_empinc'] <= 25000), '_hinccat'] = 4
    
        #Attitudes on family planning
    df['_approvefp55'] = np.NaN
    df.loc[(df['year'] == 1955), '_approvefp55'] = 0
    df.loc[(df['year'] == 1955) & (df['att_famplan'] <= 2), '_approvefp55' ] = 1
    
        #Region
    df.loc[( (df['int_LAN_region'] == 1) | (df['int_LAN_region'] == 2) ) & (df['year'] == 1970), 'res_reg' ] = 1
    df.loc[( (df['int_LAN_region'] == 3) | (df['int_LAN_region'] == 4) ) & (df['year'] == 1970), 'res_reg' ] = 2
    df.loc[( (df['int_LAN_region'] == 5) | (df['int_LAN_region'] == 6) | (df['int_LAN_region'] == 7) ) & (df['year'] == 1970), 'res_reg' ] = 3
    df.loc[( (df['int_LAN_region'] == 8) | (df['int_LAN_region'] == 9) ) & (df['year'] == 1970), 'res_reg' ] = 4
    df.loc[(df['int_LAN'] >= 54002) & (df['int_LAN'] <= 54013)  & (df['year'] == 1970), 'res_reg' ] = 1
    df.loc[ (df['statefip'] == 11) & (df['year'] == 1970), 'res_reg' ] = 3
    df.loc[ (df['year'] == 1955), 'res_reg' ] = df['_region']
    df = df.drop(columns=['_region'])
    df = df.rename(columns={'res_reg': '_region'}) #see codebook NFS65
    
        #Live births
    df.loc[ df['ch_totlb'] > 12, 'ch_totlb' ] = 12
    
        #coital frequency
    df['_coit_freq'] = np.NaN
    df.loc[df['preg0_coitfr'] < 98, '_coit_freq'] = df['preg0_coitfr']
    
    
    
    #5. Merge to state names, since some state names begin with a stray space
    sn = pd.read_stata('data/temp.dta', convert_categoricals = False)
    lis = sn['statefip']
    for i in lis:
        # assign the clean state name to every row of that state via .loc;
        # the original chained df['statename'][mask] = ... assignment is the
        # SettingWithCopy pattern and may fail silently (assumes one row per
        # statefip in sn, as the original np.repeat did)
        df.loc[df['statefip'] == i, 'statename'] = sn.loc[sn['statefip'] == i, 'statename'].iloc[0]
        
    #6. Merge to Griswold Laws
    gl = pd.read_stata('data/griswoldlaws5.dta', convert_categoricals = False)    
    
    df = df[df.statefip != 11] #drop the District of Columbia
    df['sales'] = np.NaN
    df['any'] = np.NaN
    df['_Phys'] = np.NaN
    df['cstck'] = np.NaN
    df['_Phy_LB'] = np.NaN
    
    lis = gl['statefip']
    indexcol = ['sales', 'any', '_Phys', 'cstck', '_Phy_LB']
    for column in indexcol:
        #print(column)
        for i in lis:
            # broadcast each state's law indicator with .loc instead of the
            # chained-assignment pattern (assumes one row per statefip in gl)
            df.loc[df['statefip'] == i, column] = gl.loc[gl['statefip'] == i, column].iloc[0]
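    # The nested loops above amount to a left merge on statefip; assuming one
    # row per state in gl, an equivalent single call (a sketch) would be:
    # df = df.drop(columns=indexcol).merge(gl[['statefip'] + indexcol],
    #                                      on='statefip', how='left')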
            
    df['__yobcat'] = np.NaN
    if agegroup_type_Bailey:
        lis = range(1910, 1950, 20)
    else:
        lis = range(1910, 1960, 20)
    
    for i in lis:
        #print(i)
        df.loc[ (df['_yobcat'] >=  i) & (df['_yobcat'] <  i + 20), '__yobcat'] = i  #note: possibly an error in the original Stata code (cohorts born 1950 or later stay missing)
    
    return df 
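
# The binning loops above ([i, i+5) intervals for cohort, age and ideal family
# size) can be expressed with pandas.cut. A minimal self-contained sketch on
# toy data, assuming the same half-open 5-year bins:
import numpy as np
import pandas as pd

toy = pd.DataFrame({'_yob': [1912.0, 1925.0, 1938.0, 1954.0, np.nan]})
bins = list(range(1910, 1965, 5))  # edges 1910, 1915, ..., 1960
toy['_yobcat'] = pd.cut(toy['_yob'], bins=bins, labels=bins[:-1],
                        right=False).astype(float)
print(toy)  # each _yob mapped to the lower edge of its 5-year bin; NaN stays NaN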
Example #45
import pandas as pd
import numpy as np
import pickle as pk
import math
import os

anes_data_inp=pd.read_stata("data/ANES_raw2008-1948.dta",convert_categoricals=False)
circuit_mapping={
	23:1, 25:1, 33:1, 44:1,
	9:2, 36:2, 50:2,
	10:3, 34:3, 42:3,
	24:4, 37:4, 45:4, 51:4, 54:4,
	22:5, 28:5, 48:5,
	21:6, 26:6, 39:6, 47:6,
	17:7, 18:7, 55:7,
	5:8, 19:8, 27:8, 29:8, 31:8, 38:8, 46:8,
	2:9, 4:9, 6:9, 15:9, 16:9, 30:9, 32:9, 41:9, 53:9,
	8:10, 20:10, 35:10, 40:10, 49:10, 56:10,
	1:11, 12:11, 13:11,
	11:12
}
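
# Applying the state-to-circuit mapping above (a sketch; VCF0901A is the ANES
# state identifier inspected in the comments below):
anes_data_inp['circuit'] = anes_data_inp['VCF0901A'].map(circuit_mapping)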

# Below are the sorted state IDs in the ANES
# unique = anes_data_inp['VCF0901A'].unique()
# print(sorted([int(x) for x in unique if not math.isnan(x)]))

# Print the iloc of the columns of the last 4 instruments
# print(anes_data_inp.columns.get_loc("VCF9003"))
# print(anes_data_inp.columns.get_loc("VCF9004"))
# print(anes_data_inp.columns.get_loc("VCF9005"))
# print(anes_data_inp.columns.get_loc("VCF9006"))
Example #46
# Import pandas
import pandas as pd
import matplotlib.pyplot as plt

stata_file = './OtherFiles/disarea.dta'

# Load Stata file into a pandas DataFrame: df
df = pd.read_stata(stata_file)

# Print the head of the DataFrame df
print(df.head())

# Plot histogram of one column of the DataFrame
pd.DataFrame.hist(df[['disa10']])
plt.xlabel('Extent of disease')
plt.ylabel('Number of countries')
plt.show()
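
# The same histogram via the Series plot accessor (a sketch; equivalent output
# through the same matplotlib backend):
df['disa10'].plot.hist()
plt.xlabel('Extent of disease')
plt.ylabel('Number of countries')
plt.show()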
Example #47
import pandas as pd

hhCompLvl1 = pd.read_stata('./Datasets/HouseholdComposition_level_1.dta')
hhCompLvl2 = pd.read_stata('./Datasets/HouseholdComposition_members_level_2.dta')

fDiaryLvl1 = pd.read_stata('./Datasets/33.FoodDiary_level_1.dta').sort_values(by=['hh_ID']).reset_index(drop=True)
fDiaryLvl2 = pd.read_stata('./Datasets/34.FoodDiary_food_level_2.dta').sort_values(by=['hh_ID', 'food_1_ID',
                                                                                       'food_2_ID'])
fDiaryLvl3 = pd.read_stata('./Datasets/35.Fooddiary_food_foodtype_level_3.dta')
# Read the data from Stata format into pandas DataFrames

fDiaryLvl1.drop(['submissiondate', 'start_time', 'end_time', 'today', 'start_date', 'expiration_date',
                'max_submissions', 'task_value', 'task_length'], axis=1, inplace=True)
fDiaryLvl2.drop(['food_label'], axis=1, inplace=True)
fDiaryLvl3.drop(['food_type_label', 'food_unit_label'], axis=1, inplace=True)
# Remove information irrelevant to this project

fDiary1Link2 = fDiaryLvl1.merge(fDiaryLvl2)
fDiary2Link3 = fDiary1Link2.merge(fDiaryLvl3).sort_values(by=['hh_ID', 'week_number']).reset_index(drop=True)
# Linking the food diary levels using the unique ID numbers

# fDiary2Link3.drop(fDiary2Link3.ix([:,''])
# Deleting a range of columns


householdIDLinked = pd.Series(fDiaryLvl1.hh_ID.unique()).sort_values()
weekNumbers = pd.Series(fDiaryLvl1.week_number.unique()).sort_values()
unitsUsed = pd.Series(fDiaryLvl3.food_type_unit.unique())
foodsList = pd.DataFrame(fDiary2Link3.food_type_name.unique(), columns=['food_type'])
# The list of different foods which will be used to calculate calories
foodGroups = pd.Series(fDiary2Link3.food_grp_namec.unique())
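
# When linking levels like this, pandas can assert the expected key cardinality
# and report unmatched rows. A sketch, assuming each level-1 record is unique
# on the shared key columns:
checked = fDiaryLvl1.merge(fDiaryLvl2, how='outer', indicator=True,
                           validate='one_to_many')
print(checked['_merge'].value_counts())  # both / left_only / right_only counts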
Example #48
                ucid2nsrc[ucid] = [name, stid, rid, cid]

    files = []
    with open(merged_data_files, 'r') as f:
        for file in f:
            file_path = file.splitlines()[0]
            if not os.path.exists(file_path):
                print('File %s does not exist!' % file_path)
                exit()
            files.append(file_path)

    for merged_data_file in files:
        if merged_data_file.endswith('dta'):
            fname_wo_ext = merged_data_file.split(".dta")[0]
            print("Loading dta file " + merged_data_file)
            data = pd.read_stata(merged_data_file)
            data_reader = pd.read_stata(merged_data_file, iterator=True)

            # Deal with encoding problems in data and value labels of Stata files
            for col in data.select_dtypes(include=object).columns:
                data[col] = data[col].str.encode('utf-8').str.decode(
                    'ascii', 'ignore')
                # data[col] = data[col].str.replace("\u2019", "'")
                # data[col] = data[col].str.replace("\uff0c", ",")
                # data[col] = data[col].str.replace("\uff01", "!")

            value_labels = {}
            for var in data_reader.value_labels().keys():
                var_recode = var.encode('utf-8').decode('latin-1', 'ignore')
                value_labels[var_recode] = {}
                for val in data_reader.value_labels()[var].keys():
Example #49
import os

import pandas as pd
from data_functions_albert import remove_outliers

os.chdir(
    'C:/Users/rodri/OneDrive/Documentos/IDEA/Master tesi/Data & Code (Uganda)/data10'
)
pd.options.display.float_format = '{:,.2f}'.format

dollars = 2586.89

# AGRICULTURAL SEASON 1:

#Omit rents when evaluating agricultural productivity.

# =============================================================================
# Fertilizers & labor inputs
# =============================================================================
ag3a = pd.read_stata('agsec3a.dta')
ag3a = ag3a[[
    "HHID", 'pltid', "a3aq8", "a3aq19", 'a3aq31', 'a3aq38', 'a3aq39',
    'a3aq42a', 'a3aq42b', 'a3aq42c'
]]
ag3a['hhlabor'] = ag3a["a3aq39"].fillna(0)
# Sum hours of men, women and children; we assume all are equally productive.
ag3a['hired_labor'] = (ag3a["a3aq42a"].fillna(0)
                       + ag3a["a3aq42b"].fillna(0)
                       + ag3a["a3aq42c"].fillna(0))
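
# The same row-wise total as a single vectorised sum (a sketch, equivalent to
# the fillna-and-add chain above):
ag3a['hired_labor'] = ag3a[['a3aq42a', 'a3aq42b', 'a3aq42c']].fillna(0).sum(axis=1)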

ag3a = ag3a[[
    "HHID", 'pltid', "a3aq8", "a3aq19", "a3aq31", 'hhlabor', 'hired_labor'
]]
ag3a.columns = [
Example #50
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
import pandas as pd
import theano
import seaborn as sns

import os
os.chdir("/Users/alexwolf/Dropbox/Uni/WB RS Estimates/4. Estimation/rsest/")

"--- Read Data ---"
path = "../../3. Data/Combined_Sample.dta"
Data = pd.read_stata(path)

Data[['wavecode', 'w_clothm', 'ltotR']] = Data[['wavecode', 'w_clothm', 'ltotR']].round(4)

Data[['wavecode', 'w_clothm', 'ltotR']].head()

wave_idx = Data['wavecode']
clothm_mean = np.squeeze(np.mean(Data[['w_clothm']]))

with pm.Model() as hierarchical_model:
    # Hyperpriors
    mu_a = pm.Normal('mu_alpha', mu=clothm_mean, sd=10)
    sigma_a = pm.HalfCauchy('sigma_alpha', beta=2)
    mu_b = pm.Normal('mu_beta', mu=0., sd=10)
    sigma_b = pm.HalfCauchy('sigma_beta', beta=2)

    # Intercept for each county, distributed around group mean mu_a
Example #51
import random

import scipy
import numpy as np
import pandas as pd

try:
    import SparseSC as SC
except ImportError:
    raise RuntimeError("SparseSC is not installed. Use 'pip install -e .' or 'conda develop .' from repo root to install in dev mode")

random.seed(12345)
np.random.seed(101101001)

pkl_file = "../replication/smoking_fits.pkl"

smoking_df = pd.read_stata("../replication/smoking.dta")
smoking_df['year'] = smoking_df['year'].astype('int')
smoking_df = smoking_df.set_index(['state', 'year'])
Y = smoking_df[['cigsale']].unstack('year')
T0 = 19
i_t = 2 #unit 3, but zero-indexed
treated_units = [i_t]
control_units = [u for u in range(Y.shape[0]) if u not in treated_units]


Y_names = Y.columns.get_level_values('year')
Y_pre_names = ["cigsale(" + str(i) + ")" for i in Y_names[:T0]]
print(Y.isnull().sum().sum()) #0
Y = Y.values
T = Y.shape[1]
T1 = T-T0
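
# Quick sanity checks on the panel constructed above (a sketch):
assert not np.isnan(Y).any()  # balanced panel with no gaps
assert Y.shape == (len(treated_units) + len(control_units), T0 + T1)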
Example #52
# Imports assumed by this snippet (the comments below note pandas and pyplot
# are already imported in the original exercise):
from sas7bdat import SAS7BDAT
import pandas as pd
import matplotlib.pyplot as plt

with SAS7BDAT('sales.sas7bdat') as file:
    df_sas = file.to_data_frame()

# Print head of DataFrame
print(df_sas.head())

# Plot histogram of DataFrame features (pandas and pyplot already imported)
pd.DataFrame.hist(df_sas[['P']])
plt.ylabel('count')
plt.show()

# Import pandas
# import pandas as pd

# Load Stata file into a pandas DataFrame: df
df = pd.read_stata('disarea.dta')

# Print the head of the DataFrame df
print(df.head())

# Plot histogram of one column of the DataFrame
pd.DataFrame.hist(df[['disa10']])
plt.xlabel('Extent of disease')
plt.ylabel('Number of countries')
plt.show()

# Import packages
import numpy as np
import h5py

# Assign filename: file
Example #53
import pandas as pd
from idcempy import gimnl
import os
import unittest
import numpy as np

"""
Replication of results from
Bagozzi, Benjamin E. and Kathleen Marchetti. 2017.
"Distinguishing Occasional Abstention
from Routine Indifference in Models of Vote Choice."
Political Science Research and Methods. 5(2): 277-249.
"""

DAT = pd.read_stata(
    os.getcwd() + "/data/replicationdata.dta", convert_categoricals=False)

x = ['educ', 'female', 'black', 'hispanic', 'party7', 'w3gbvalu2',
     'presbatt',
     'south', 'gayban2', 'agegroup2', 'bornagn_w', 'brnag_wXgmb2', 'catholic',
     'cathXgmb2', 'other_rel', 'secular', 'secXgmb2', 'ideo', 'w3mobidx']
z = ['educ', 'agegroup2', 'w3mobidx', 'secular']
y = ['vote_turn']

order_Kerry = [0, 1, 2]
order_Bush = [0, 2, 1]

binflatecat = "baseline"
sinflatecat = "second"
tinflatecat = "third"
Example #54
from pyreghdfe import Pyreghdfe
from sklearn.datasets import load_boston
from utils import sklearn_to_df
import pandas as pd
import numpy as np
from utils import get_np_columns
import statsmodels.formula.api as smf
import econtools
import econtools.metrics as mt
import statsmodels.api as sm

df = pd.read_stata('data/cleaned_nlswork.dta')
df['hours_log'] = np.log(df['hours'])

# sanity check: cluster standard errors without absorption
#results = smf.ols(formula='wks_work~1', data=df).fit()
# y = np.asarray([[-100, 100, -100, 1234]]).T
# X = np.asarray([[1,2 ,3, 4], [1, 1, 1, 1]]).T
# results = sm.OLS(y, X).fit()
#results = smf.ols(formula='ttl_exp~wks_ue', data=df).fit()
# results = results.get_robustcov_results(cov_type='cluster',
#                                         groups=get_np_columns(df, ['idcode']))
# print(np.sum(results.resid))
# import pdb; pdb.set_trace()
pyreghdfe = Pyreghdfe(df=df,
                      target='hours_log',
                      predictors=['union'],
                      ids=['year'],
                      cluster_ids=['idcode'])
results = pyreghdfe.fit(cov_type='cluster', groups=['idcode'])
Example #55
# Import required packages
#------------------------------------------------------------------------------
import pandas as pd  # will be used to read .dta file by .read_stata()
import numpy as np
import matplotlib.pyplot as plt  # used to visualize the relationship between the variables of interest in a scatterplot
#%matplotlib inline
import math  # to use in some equations
from scipy.optimize import minimize  # for optimization of Likelihood function (MLE method)
import scipy.optimize as opt
import statsmodels.api as statmod
import scipy.stats as stats

#------------------------------------------------------------------------------
# Read data and prepare to utilize (Part 1 & 2)
#------------------------------------------------------------------------------
df1 = pd.read_stata('PS3_data.dta')
# My note for later: https://www.shanelynn.ie/using-pandas-dataframe-creating-editing-viewing-data-in-python/
print("Dataframe: ")
df1.head()
'''
hlabinc = annual labor income of the head
hannhrs = annual hours of the head
hsex = gender of the head (1 = Male, 2 = Female)
hrace = race of the head (1 = white, 2 = Black, 3 = Native American, 4 = Asian/Pacific Islander, 5 = Hispanic, 6,7 = Other)
age = age of the head
hyrsed = years of education of the head
'''
print("Data Statistics:")
df1.describe()
print(
    "Scatterplot between annual labor inome of the head and years of education of the head"
Example #56
"""This function cleans the data from the chs_data.dta. It
   prepares the dataset we use for further analysis.
"""

import numpy as np
import pandas as pd
from bld.project_paths import project_paths_join as ppj


def save_data(sample):
    """
    Save clean data as .csv file.
    """
    sample.to_csv(ppj("OUT_DATA", "data_clean.csv"), sep=",")


if __name__ == "__main__":
    data = pd.read_stata(ppj("IN_DATA", "chs_data.dta"))
    data.replace(-100, np.nan, inplace=True)
    data = data.query("age == 0")
    data.reset_index(inplace=True)
    data = data["weightbirth"]
    data.fillna(data.mean(), inplace=True)
    save_data(data)
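
# The same cleaning pipeline written as one method chain (a sketch, same
# behaviour as the steps above):
def clean_chs(path):
    return (
        pd.read_stata(path)
        .replace(-100, np.nan)
        .query("age == 0")
        .reset_index()["weightbirth"]
        .pipe(lambda s: s.fillna(s.mean()))
    )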
Example #57
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_stata('C:/Users/林子晴/Desktop/dataset.dta')

df = df[df.taxeringsvrde > 0]
df = df[df.pris > 0]
df = df[df.taxr == 2006]
df = df[df.fakod == 1]
df = df.sort_values("kontraktsdatum", ascending=True)

month = df.groupby(df['month'])

year = df.groupby(df['year_nr'])


def get_year_price(i):
    return year.get_group(i)['pris']


def get_year_day(i):
    return year.get_group(i)['kontraktsdatum']


def get_day(k):
    return month.get_group(k)['day_nr']


def get_date(k):
    return month.get_group(k)['kontraktsdatum']
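
# Usage sketch for the getters above (hypothetical keys; actual values come
# from the month and year_nr groupings):
# prices_2006 = get_year_price(2006)
# days_in_month_5 = get_day(5)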
Example #58
from os.path import abspath, dirname, join

from pandas import read_stata

# `du` is the parent project's dataset-utilities module, assumed to be
# imported elsewhere (e.g. `from statsmodels.datasets import utils as du`).


def _get_data():
    filepath = dirname(abspath(__file__))
    filename = join(filepath, 'credit_cards.dta')
    data = read_stata(filename)
    data = du.strip_column_names(data)
    return data
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 20 10:31:13 2018

@author: noviayou
"""

import numpy as np
import pandas as pd

d1 = pd.read_stata('cds.dta')
d2 = pd.read_csv('WRDS.csv')
d2.columns = map(str.lower, d2.columns)

d1.gvkey = d1.gvkey.astype('int')

d2['mdate'] = d2['datadate'].apply(
    lambda x: pd.to_datetime(str(x), format='%Y%m%d'))

d1['year'] = pd.DatetimeIndex(d1['mdate']).year
d1['month'] = pd.DatetimeIndex(d1['mdate']).month

d2['year'] = pd.DatetimeIndex(d2['mdate']).year
d2['month'] = pd.DatetimeIndex(d2['mdate']).month

d2.month.unique()

#d2['quarter']= d2.apply(lambda x : 1 if d2['month'] == (2,3) else )
#
#def type(i):
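
# A working version of the quarter mapping sketched in the comments above
# (a sketch; assumes standard calendar quarters from the month number):
d2['quarter'] = (d2['month'] - 1) // 3 + 1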
                                inplace=True)
mergecat_sele_qid_location['qid_cat'] = mergecat_sele_qid_location['qid_cat'].fillna(0)
mergecat_sele_qid_location['qid_cat'] = mergecat_sele_qid_location['qid_cat'].astype(int)
mergecat_sele_qid_location = mergecat_sele_qid_location.replace(0, np.nan)
###################step2:merge with country and get country information#######################
qid_MtCountry = qid_MtCountry.loc[:, ['qid_cat', 'qid_country', 'subject']]
qid_MtCountry.drop_duplicates(inplace=True)
mergecat_sele_qid_location_WP = pd.merge(mergecat_sele_qid_location,
                                         qid_MtCountry,
                                         on='qid_cat',
                                         how='left')

A1_05_03_mapCountry = pd.read_stata(material_path +
                                    'material/A1_05_03_mapCountry.dta')
A1_05_03_mapCountry.rename(columns={'location': 'L_or_P'}, inplace=True)
mergecat_sele_qid_location_WP = pd.merge(mergecat_sele_qid_location_WP,
                                         A1_05_03_mapCountry,
                                         on='L_or_P',
                                         how='left')

#get country information from "location"
mergecat_sele_qid_location_WP['locationLabel'] = mergecat_sele_qid_location_WP[
    'locationLabel'].str.replace(" ", "_")
A1_05_03_mapCountry.rename(columns={'L_or_P': 'locationLabel'}, inplace=True)
A1_05_03_mapCountry.rename(
    columns={'country': 'country_locationLabel_bselect'}, inplace=True)
mergecat_sele_qid_location_WP = pd.merge(mergecat_sele_qid_location_WP,
                                         A1_05_03_mapCountry,
                                         on='locationLabel',