def nber_convert_dta_to_hdf(source_dir, target_dir, index='year'):
    if index == 'year':
        fn = target_dir + "nber_year.h5"
        store = pd.HDFStore(fn, complevel=9, complib='zlib')
        for year in range(1962, 2000 + 1):
            fn = source_dir + "wtf%s.dta" % str(year)[2:]
            print("Loading Year: %s from file: %s" % (year, fn))
            data = pd.read_stata(fn)
            store.put('Y' + str(year), data, format='table')
        print("HDF File Saved ...")
        print(store)
        store.close()
    else:
        data = pd.DataFrame()
        for year in range(1962, 2000 + 1):
            fn = source_dir + "wtf%s.dta" % str(year)[2:]
            print("Loading Year: %s from file: %s" % (year, fn))
            data = data.append(pd.read_stata(fn))
        fn = target_dir + "nber.h5"
        store = pd.HDFStore(fn, complevel=9, complib='zlib')
        store.put('nber', data, format='table')
        print("HDF File Saved ...")
        print(store)
        store.close()
    print("Convert DTA to HDF Finished!")
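# Usage sketch for the converter above (directory paths are hypothetical);
# with index='year' the per-year tables can be read back by key:
nber_convert_dta_to_hdf("raw/", "hdf/", index='year')
store = pd.HDFStore("hdf/nber_year.h5")
wtf_1990 = store['Y1990']  # keys run from 'Y1962' to 'Y2000'
store.close()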
def load_trade4digit_country():
    prescriptives = pd.read_stata(prefix_path("Trade/exp_ecomplexity_rc.dta"))

    exports = pd.read_stata(prefix_path("Trade/exp_rpy_rc_p4.dta"))
    exports = exports.rename(columns={"X_rpy_d": "export_value",
                                      "NP_rpy": "export_num_plants"})

    imports = pd.read_stata(prefix_path("Trade/imp_rpy_rc_p4.dta"))
    imports = imports.rename(columns={"X_rpy_d": "import_value",
                                      "NP_rpy": "import_num_plants"})

    descriptives = exports.merge(imports, on=["yr", "r", "p"], how="outer")
    descriptives = descriptives.fillna({
        "export_value": 0,
        "export_num_plants": 0,
        "import_value": 0,
        "import_num_plants": 0,
    })

    combo = prescriptives.merge(descriptives,
                                left_on=["yr", "r", "p4"],
                                right_on=["yr", "r", "p"])
    combo = combo[combo.yr.between(YEAR_MIN_TRADE, YEAR_MAX_TRADE)]
    combo["r"] = "COL"
    return combo
def stata_use(self, args, opts=[]):

    def parse_filename(fname_arg):
        ext = '.dta'
        if fname_arg.endswith(ext):
            return fname_arg
        return fname_arg + ext

    ## Defaults
    labels = True
    ## Opts
    if "nolabel" in opts or "nol" in opts:
        labels = False
    if "clear" not in opts:
        if self.data is not None:
            print("you must start with an empty data set")
            return
    if 'using' not in args:
        fname = parse_filename(args[0])
        self.data = pd.read_stata(fname, convert_categoricals=labels)
    else:
        fname = parse_filename(args[args.index('using') + 1])
        self.data = pd.read_stata(fname, convert_categoricals=labels)
        if 'if' in args:
            ifcondition = args[args.index('if') + 1:args.index('using')]
            # todo: 'in'
            self.data = self.if_filter(ifcondition)
            varlist = args[:args.index('if')]
        else:
            varlist = args[:args.index('using')]
        if len(varlist) > 0:
            self.data = self.data[varlist]
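# Usage sketch mirroring Stata's `use price mpg using auto, clear`
# (the session object and the auto.dta file are hypothetical):
session.stata_use(['price', 'mpg', 'using', 'auto'], opts=['clear'])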
def compare(self, seuil_abs=100, seuil_rel=0.10):
    '''
    Compare the OpenFisca (OF) and TaxIPP computations.
    Handles the outputs.
    '''
    dta_output = self.paths['dta_output']
    ipp_output = read_stata(dta_output).fillna(0)
    dta_input = self.paths['dta_input']
    ipp_input = read_stata(dta_input).fillna(0)
    openfisca_output = self.openfisca_output.fillna(0)
    openfisca_input = self.simulation.input_table.table
    ipp2of_output_variables = self.dic_var_output

    check_list = ['csg_sal_ded', 'irpp_net_foy', 'af_foys']  # 'csg_sal_ded',
    print(self.dic_param)
    for ipp_var in check_list:
        of_var = ipp2of_output_variables[ipp_var]
        entity = self.simulation.prestation_by_name[of_var].entity
        if entity == 'ind':
            conflict = ((ipp_output[ipp_var] - openfisca_output[of_var].abs()).abs() < THRESHOLD)
            print(conflict.to_string())
            print(ipp_output.loc[not_(conflict), ipp_var].to_string())
            print(openfisca_output.loc[not_(conflict), of_var].to_string())
            print(ipp_input.loc[not_(conflict), ].to_string())
            print(openfisca_input.loc[not_(conflict), self.relevant_input_variables()].to_string())
            # error_diag()  # TODO: finish by calling error_diag
        elif entity == "fam":
            pass
        elif entity == "foy":
            openfisca_foy = openfisca_output.loc[openfisca_input.quifoy == 0, of_var]
            ipp_foy = ipp_output.loc[openfisca_input.quifoy == 0, ipp_var]
            print(ipp_foy)
            conflict = ((ipp_foy - openfisca_foy.abs()).abs() > THRESHOLD)
            print(conflict.to_string())
        elif entity == "men":
            pass

    def _diff(seuil_abs, seuil_rel):
        for k, v in dic.items():
            diff_abs = ipp_output[k].mean() - openfisca_output[v].mean()
            if diff_abs > seuil_abs:
                print(" Absolute difference for ", k, ' : ', diff_abs)
            diff_rel = (ipp_output.loc[(ipp_output[k] != 0) & (openfisca_output[v] != 0), k]
                        / openfisca_output.loc[(ipp_output[k] != 0) & (openfisca_output[v] != 0), v]).mean()
            if (diff_rel > seuil_rel) & (diff_rel is not None):
                print(" Relative difference for ", k, ' : ', diff_rel)
def load_pwt_data(version=80, deltas=False):
    """
    Load the Penn World Tables data as a Pandas Panel object. Function expects
    a local copy of the PWT data file in the working directory. If no local
    copy exists, then one will be downloaded automatically (assuming that you
    have a working internet connection!).

    Arguments:

        version: (int) Version number for PWT data. Default is 80 (which is
                 the most recent version).
        deltas:  (boolean) Whether or not you wish to load the data on
                 depreciation rates (which is included in a separate .dta
                 file). Default is False.

    Returns:

        pwt: A Pandas Panel object containing the Penn World Tables data.

    TODO: Work out a way to merge Pandas Panel objects.

    """
    # first check for a local copy of PWT
    try:
        data = pd.read_stata('pwt' + str(version) + '.dta')
    # otherwise,
    except IOError:
        get_pwt_data(version)
        data = pd.read_stata('pwt' + str(version) + '.dta')

    # convert to a panel object
    data.index = [data['year'], data['countrycode']]
    data = data.to_panel()

    if deltas:
        # load the separate file containing the depreciation rates
        dep_rates = pd.read_stata('depreciation_rates.dta')

        # convert to panel
        dep_rates.index = [dep_rates['year'], dep_rates['countrycode']]
        dep_rates = dep_rates.to_panel()

        return [data, dep_rates]
    else:
        return data
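# Usage sketch for the loader above (assumes pwt80.dta and
# depreciation_rates.dta resolve as the function expects):
pwt = load_pwt_data(version=80)                          # Panel of PWT variables
data, dep_rates = load_pwt_data(version=80, deltas=True)  # returns [data, dep_rates]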
def load_dta(filename, chunksize=5000, **kwargs):
    # use columns as a list
    reader = pd.read_stata(filename, iterator=True, **kwargs)
    df = pd.DataFrame()
    try:
        chunk = reader.get_chunk(chunksize)
        i = 0
        while len(chunk) > 0:
            i += 1
            df = df.append(chunk, ignore_index=True)
            print('Loaded {} rows...'.format(len(df)))
            sys.stdout.flush()
            chunk = reader.get_chunk(chunksize)
        print('Done!')
        return df
    except StopIteration:
        # reaching the end of the file raises StopIteration, which is expected
        print('Done!')
        return df
    except Exception as e:
        print(type(e).__name__ + ': ' + str(e))
        return df
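# Usage sketch (the file name is hypothetical); reading via get_chunk keeps
# peak memory bounded while the full frame is assembled:
df = load_dta('large_survey.dta', chunksize=10000, convert_categoricals=False)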
def fill_hdf_from_stata(self, table):
    from pandas import read_stata

    assert table in self.tables, "Table {} is not a filled table".format(table)
    stata_file = self.tables[table]["stata_file"]
    try:
        variables = self.tables[table]['variables']
    except KeyError:
        variables = None
    if not os.path.isfile(stata_file):
        raise Exception("file_path {} does not exist".format(stata_file))
    log.info("Inserting stata table {} in file {} in HDF file {} at point {}".format(
        table,
        stata_file,
        self.hdf5_file_path,
        table,
        )
    )
    stored_dataframe = read_stata(stata_file)
    store_path = table
    if variables is not None:
        log.info('variables asked by the user: {}'.format(variables))
        variables_stored = list(set(variables).intersection(set(stored_dataframe.columns)))
        log.info('variables stored: {}'.format(variables_stored))
        stored_dataframe[variables_stored].to_hdf(self.hdf5_file_path, store_path,
                                                  format='table', append=False)
    else:
        try:
            stored_dataframe.to_hdf(self.hdf5_file_path, store_path,
                                    format='table', append=False)
        except Exception:
            stored_dataframe.to_hdf(self.hdf5_file_path, store_path, append=False)
    gc.collect()
def getPats():
    """
    Loads NBER patent data, 1976-2006, and returns two data frames.
    One df with biotech patents, defined by the icl and icl_class, and one df
    with ai patents, defined by nclass.
    """
    # Import patent data
    orig7606 = pd.read_stata(nberPath + '/orig_gen_76_06.dta')
    pat7606 = pd.read_csv(nberPath + '/pat76_06_ipc.csv', index_col=0)

    # Since pat76-06 has multiple records for each patent (= number of
    # assignees), remove duplicates so there is one record per patent
    pat7606 = pat7606.drop_duplicates(subset=['patent'])
    pat7606 = pat7606.drop(['uspto_assignee', 'year'], axis=1)

    # Put originality, generality, and citation measures onto the pat76-06 dataframe
    pat7606 = pd.merge(pat7606, orig7606, on=['patent'], how='left')

    # Filter down to only bio-pats using hard-coded classifications from (REFERENCE)
    bioICL = pd.DataFrame(
        ['A01H 100', 'A01H 400', 'A61K 3800', 'A61K 3900', 'A61K 4800',
         'C02F 334', 'C07G 1100', 'C07G 1300', 'C07G 1500', 'C07K 400',
         'C07K 1400', 'C07K 1600', 'C07K 1700', 'C07K 1900', 'G01N 27327',
         'G01N 3353', 'G01N 33531', 'G01N 33532', 'G01N 33533', 'G01N 33534',
         'G01N 33535', 'G01N 33536', 'G01N 33537', 'G01N 33538', 'G01N 33539',
         'G01N 3354', 'G01N 33541', 'G01N 33542', 'G01N 33543', 'G01N 33544',
         'G01N 33545', 'G01N 33546', 'G01N 33547', 'G01N 33548', 'G01N 33549',
         'G01N 3355', 'G01N 33551', 'G01N 33552', 'G01N 33553', 'G01N 33554',
         'G01N 33555', 'G01N 33556', 'G01N 33557', 'G01N 33558', 'G01N 33559',
         'G01N 3357', 'G01N 33571', 'G01N 33572', 'G01N 33573', 'G01N 33574',
         'G01N 33575', 'G01N 33576', 'G01N 33577', 'G01N 33578', 'G01N 33579',
         'G01N 3368', 'G01N 3374', 'G01N 3376', 'G01N 3378', 'G01N 3388',
         'G01N 3392'],
        columns=['icl'])
    bioICLclass = pd.DataFrame(['C12M', 'C12N', 'C12P', 'C12Q', 'C12S'],
                               columns=['icl_class'])
    bioPats1 = pat7606.merge(bioICL, on=['icl'])
    bioPats2 = pat7606.merge(bioICLclass, on=['icl_class'])
    bioPats = pd.concat([bioPats1, bioPats2])

    # Filter down to ai pats
    aiPats = pat7606[pat7606.nclass == 706]

    return bioPats, aiPats
def main():
    print("Enter a filename")
    filename = input()
    if ".dta" in filename:
        df = pd.read_stata(os.path.join(RAW_DATA, filename))
        # I want to treat these as string values rather than actually missing
    elif ".csv" in filename:
        df = pd.read_csv(os.path.join(RAW_DATA, filename))
        df = df.fillna("missing")
    else:
        sys.exit("File type not detected or supported. .csv or .dta accepted")
    df.columns = [x.lower() for x in df.columns]
    deviceList = []
    for i in range(len(df.index)):
        observation = df.loc[i]
        os_type = determine_os_type(observation)
        df.loc[i, 'devicetype'] = os_type
    # print df.at[0, 'responseid']
    for x in sorted(get_column_levels(df, COLUMNS_DICT['useragent'])):
        print("Warning: unknown device type for entries labeled {}".format(x))
    # To save space I'm only going to write an id variable and the newly created devicetype
    try:
        df = df[['responseid', 'devicetype', 'bmi_useragent']]
    except KeyError:
        df = df[['id', 'devicetype', 'bmi_useragent']]
    df.to_stata(os.path.join(OUT_DIR, '{}_devices.dta'.format(
        filename.replace(".dta", "").replace(".csv", ""))), write_index=False)
def dta2dbf(dta_path, dbf_path):
    """
    Wrapper function to convert a stata dta file into a dbf file.

    __author__ = "Luc Anselin <*****@*****.**> "

    Uses df2dbf to write the dbf files from a pandas DataFrame. Uses all
    default settings for df2dbf (see docs for specifics).
    ...

    Arguments
    ---------
    dta_path    : str
                  Path to the Stata dta file
    dbf_path    : str
                  Path to the output dbf file

    Returns
    -------
    dbf_path    : path to output file

    """
    db = pd.read_stata(dta_path)
    dp = df2dbf(db, dbf_path)
    return dp
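# Usage sketch (paths hypothetical); the return value is whatever path
# df2dbf reports for the written dbf file:
out_path = dta2dbf('blocks.dta', 'blocks.dbf')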
def __init__(self, filename):
    if filename.endswith('csv'):
        pd.DataFrame.__init__(self, data=pd.read_csv(filename))
    elif filename.endswith('tab'):
        pd.DataFrame.__init__(self, data=pd.read_csv(filename, sep='\t'))
    elif filename.endswith('dta'):
        pd.DataFrame.__init__(self, data=pd.read_stata(filename))
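# Usage sketch; the enclosing class is not shown above, so assume a
# DataFrame subclass named TabularFile (hypothetical name):
df = TabularFile('survey.dta')  # dispatches on extension: .csv, .tab, or .dta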
def dta_pandas(fp):
    """Read and convert a dta file to JSON format using the pandas library

    :param fp: File pointer object
    :return: tuple of table headers and data
    """
    dataframe = pandas.read_stata(fp)
    return data_from_dataframe(dataframe)
def loadSTATA(self):
    try:
        df = pd.read_stata(str(self.filename))
    except Exception:
        print("unexpected error occurred")
        return
    print(df.head())
    l = list(df.columns)
    print(df.head())
    head = self.tableWidget_3.horizontalHeader()
    head.setStretchLastSection(True)
    nrow = min(len(df.index), 100)
    # self.datatable = QtGui.QTableWidget(parent=self)
    self.tableWidget_3.setColumnCount(len(df.columns))
    self.tableWidget_3.setRowCount(nrow)
    for i in range(nrow):
        for j in range(len(df.columns)):
            self.tableWidget_3.setItem(i, j, QtGui.QTableWidgetItem(str(df.iat[i, j])))
    self.tableWidget_3.setHorizontalHeaderLabels(l)
    self.headerName = l
    self.nonSelectedVariables = self.headerName
    self.data = df
    st = str(nrow) + " of " + str(len(df.index)) + " rows have been shown"
    self.label.setText(st)
    self.label.setVisible(True)
    self.initDict()
    self.initComboBox()
def webuse(data, baseurl='https://www.stata-press.com/data/r11/', as_df=True):
    """
    Download and return an example dataset from Stata.

    Parameters
    ----------
    data : str
        Name of dataset to fetch.
    baseurl : str
        The base URL to the stata datasets.
    as_df : bool
        Deprecated. Always returns a DataFrame

    Returns
    -------
    dta : DataFrame
        A DataFrame containing the Stata dataset.

    Examples
    --------
    >>> dta = webuse('auto')

    Notes
    -----
    Make sure baseurl has trailing forward slash. Doesn't do any
    error checking in response URLs.
    """
    url = urljoin(baseurl, data + '.dta')
    return read_stata(url)
def _open_stata_file(self, path):
    stata_file = pd.read_stata(
        path,
        convert_categoricals=False,
        order_categoricals=False,
        iterator=True,
    )
    return stata_file
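# Usage sketch (caller and path hypothetical). With iterator=True the call
# returns a StataReader rather than a DataFrame, so rows and metadata can be
# pulled lazily:
reader = obj._open_stata_file('survey.dta')
labels = reader.variable_labels()  # column name -> Stata variable label
first_rows = reader.read(100)      # read only the first 100 rows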
def output(): files = glob.glob("".join((gen_input, "*.dta"))) for file in files: stata_file = pandas.read_stata(file, encoding='latin1') name = basename(file) stata_file = stata_file.drop("level_0", 1) name = name.replace(".dta", "gen.dta") stata_file.to_stata("".join((gen_output, name)))
def get_pwt_data(version=80, extract=True):
    """
    Load the Penn World Tables data. If no local copy of the data is found,
    then a copy will be downloaded from the web.

    Arguments:

        version: (int) Version number for PWT data. Default is 80 (which is
                 the most recent version).
        extract: (boolean) Whether or not you wish to save a local copy of
                 the PWT data in your working directory. Default is True.

    Returns:

        pwt: A Pandas Panel object containing the Penn World Tables data.

    TODO: Convert time index to datetime object.

    """
    # first check for a local copy of PWT
    try:
        path = 'pwt' + str(version) + '.dta'
        pwt = pd.read_stata(path)
    # otherwise, download the appropriate zip file
    except IOError:
        url = ('http://www.rug.nl/research/GGDC/data/pwt/V' + str(version) +
               '/pwt' + str(version) + '.zip')
        archive = zipfile.ZipFile(StringIO(urlopen(url).read()), 'r')

        # to extract or not to extract...
        tmp_file = 'pwt' + str(version) + '.dta'
        if extract:
            archive.extractall()
            pwt = pd.read_stata(tmp_file)
        else:
            pwt = archive.read(tmp_file)
            pwt = pd.read_stata(StringIO(pwt))

    # convert to Pandas Panel object
    pwt.index = [pwt['year'], pwt['countrycode']]
    pwt = pwt.to_panel()

    return pwt
def nber_convert_dta_to_csv(source_dir, target_dir):
    for year in range(1962, 2000 + 1):
        fn = source_dir + "wtf%s.dta" % str(year)[2:]
        print("Loading Year: %s from file: %s" % (year, fn))
        data = pd.read_stata(fn)
        fn = target_dir + "wtf%s.csv" % str(year)[2:]
        print("Converting Year: %s from file: %s" % (year, fn))
        data.to_csv(fn, index=False, quoting=csv.QUOTE_NONNUMERIC)
    print("Convert DTA to CSV Finished!")
def read_stata(stata_name):
    logger.info("read \"" + stata_name + "\"")
    data = pd.read_stata(
        stata_name,
        iterator=True,
        convert_categoricals=False,
    )
    d, m = parse_dataset(data, stata_name)
    return d, m
def batch_process(func, prefix):
    for filename in filenames[first:last]:
        basename = os.path.splitext(os.path.basename(filename))[0]
        outname = tmp + "/" + prefix + basename + '.csv'
        if os.path.isfile(outname):
            continue
        func(pd.read_stata(filename)).to_csv(outname, header=True)
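# Usage sketch (the transform and prefix are hypothetical; `filenames`,
# `first`, `last`, and `tmp` are module-level globals the loop relies on):
batch_process(lambda df: df.describe(), prefix='summary_')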
def _load_data_stata(self, symbol, file_path):
    """
    Load .dta data files
    """
    # Load the data file and index it on its date column. Note that
    # pd.read_stata() does not take read_csv-style header/index_col/
    # parse_dates keywords, so the index is set afterwards (the date is
    # assumed here to be the first column).
    df = pd.read_stata(file_path)
    date_col = df.columns[0]
    df[date_col] = pd.to_datetime(df[date_col])
    self.symbol_data[symbol] = df.set_index(date_col).sort_index()
def output(): files = glob.glob("".join((cleaning_input, "*.dta"))) for file in files: stata_file = pandas.read_stata(file, encoding='latin1') name = basename(file) print(name) stata_file = stata_file.drop("level_0", 1) name = re.sub("-raw", "", name) stata_file.to_stata("".join((cleaning_output, name)))
def load_trade4digit_municipality():
    exports = pd.read_stata(prefix_path("Trade/exp_rpy_r5_p4.dta"))
    exports = exports.rename(columns={"X_rpy_d": "export_value",
                                      "NP_rpy": "export_num_plants"})

    imports = pd.read_stata(prefix_path("Trade/imp_rpy_r5_p4.dta"))
    imports = imports.rename(columns={"X_rpy_d": "import_value",
                                      "NP_rpy": "import_num_plants"})

    descriptives = exports.merge(imports, on=["yr", "r", "p"], how="outer")
    descriptives = descriptives.fillna({
        "export_value": 0,
        "export_num_plants": 0,
        "import_value": 0,
        "import_num_plants": 0,
    })
    descriptives = descriptives[descriptives.yr.between(YEAR_MIN_TRADE, YEAR_MAX_TRADE)]
    return descriptives
def test_export_data_G(self):
    #-pyeconlab-#
    self.obj.construct_sitc_dataset(data_type='export', dataset="G", product_level=3,
                                    sitc_revision=2, report=False, verbose=False)
    #-stata-#
    self.G = pd.read_stata(TEST_DATA_DIR + "nberwtf_stata_export_sitcr2l3_1962to2000_G.dta")
    self.G.sort_values(['year', 'eiso3c', 'sitc3'], inplace=True)
    self.G.reset_index(inplace=True)
    del self.G['index']
    assert_allclose(self.obj.dataset['value'].values, self.G['value'].values)
    del self.G
def test_bilateral_data_F(self):
    #-pyeconlab-#
    self.obj.construct_sitc_dataset(data_type='trade', dataset="F", product_level=3,
                                    sitc_revision=2, report=False, verbose=False)
    #-stata-#
    self.F = pd.read_stata(TEST_DATA_DIR + "nberwtf_stata_trade_sitcr2l3_1962to2000_F.dta")
    self.F = self.F.sort_values(['year', 'eiso3c', 'iiso3c', 'sitc3'])
    self.F = self.F.reset_index()
    del self.F['index']
    assert_allclose(self.obj.dataset['value'].values, self.F['value'].values)
    del self.F
def test_bilateral_data_A(self):
    # -pyeconlab-#
    self.obj.construct_dataset_SC_CNTRY_SR2L3_Y62to00_A(data_type="trade")
    # -stata-#
    self.A = pd.read_stata(TEST_DATA_DIR + "A-nberfeenstrawtf_do_stata_basic_country_sitc3_bilateral.dta")
    self.A.sort_values(["year", "eiso3c", "iiso3c", "sitc3"], inplace=True)
    self.A.reset_index(inplace=True)
    del self.A["index"]
    assert_allclose(self.obj.dataset["value"].values, self.A["value"].values)
    del self.A
def setup_class(cls):
    """Stata reg output from `sysuse auto; reg price mpg`"""
    cls.init(cls)
    test_path = path.split(path.relpath(__file__))[0]
    auto_path = path.join(test_path, 'data', 'auto.dta')
    autodata = pd.read_stata(auto_path)
    y = 'price'
    x = ['mpg', 'length']
    cls.result = reg(autodata, y, x, cluster='gear_ratio', addcons=True)
    cls.expected = ols_cluster
def test_import_data_A(self):
    #-pyeconlab-#
    self.obj.construct_sitc_dataset(data_type='import', dataset="A", product_level=3,
                                    sitc_revision=2, report=False, verbose=False)
    #-stata-#
    self.A = pd.read_stata(TEST_DATA_DIR + "bacihs96_stata_import_sitcr2l3_1998to2012_A.dta")
    self.A.sort_values(['year', 'iiso3c', 'sitc3'], inplace=True)
    self.A.reset_index(inplace=True)
    del self.A['index']
    assert_allclose(self.obj.dataset['value'].values, self.A['value'].values)
    del self.A
def get_labor_data():
    # read in "raw" CPS data to calculate moments
    # these data were actually cleaned in hours_data_cps_setup.do
    fileDir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(
        fileDir, '../Data/Current_Population_Survey/cps_est_ability_hours_1992to2013.dta')
    filename = os.path.abspath(os.path.realpath(filename))
    cps = pd.read_stata(filename, columns=['year', 'age', 'hours', 'hours_unit', 'wtsupp'])
    return cps
def test_import_data_D(self):
    # a to_import would be more efficient here
    # -pyeconlab-#
    self.obj.construct_dataset_SC_CNTRY_SR2L3_Y62to00_D(data_type="import")
    # -stata-#
    self.D = pd.read_stata(TEST_DATA_DIR + "D-nberfeenstrawtf_do_stata_basic_country_sitc3_imports.dta")
    self.D.sort_values(["year", "iiso3c", "sitc3"], inplace=True)
    self.D.reset_index(inplace=True)
    del self.D["index"]
    assert_allclose(self.obj.dataset["value"].values, self.D["value"].values)
    del self.D
respgender.columns = ['prop']
respgender = respgender.rename(columns={'level_1': 'category'})

# make educ with cats in col
educcats = [c for c in descr.index if 'educ_cat' in c]
respeduc = descr[descr.index.isin(educcats)][['group', 'mean']]
# rows to cols pivot
# respeduc = pd.pivot_table(respeduc, index=['group', 'variable']).reset_index()

## age categories with weighted stats
# load in clean data
clean = pd.read_stata(clean / "PAPAB Impact study - Ready for analysis.dta")

ages_df = clean.loc[:, ['age', 'pip_generation_clean', 'weight_generation',
                        'weight_generation_inv']]
print(ages_df.age.isna().sum())  # no missing values

bins = [0, 24, 34, 44, 54, 64, np.inf]
names = ['<24', '25-34', '35-44', '45-54', '55-64', '65+']
ages_df['agerange'] = pd.cut(ages_df['age'], bins, labels=names)
ages_dum = pd.get_dummies(ages_df['agerange'])
ages_df = pd.concat([ages_df, ages_dum], axis=1, sort=False)

# now aggregate averages across cats + a weight for all pip
import pandas as pd
import difflib

asha_df = pd.read_stata('E:/JSY MIS data/ASHA/data_asha_nrhm_without15.dta')
dhs_df = pd.read_stata('E:/JSY MIS data/ASHA/st_dist_dhs_2.dta')
print(len(asha_df))


def fix_spelling(x, source):
    try:
        return difflib.get_close_matches(x, source)[0]
    except Exception:
        return "NA"


asha_df['s_nrhm'] = asha_df['s_nrhm'].apply(
    lambda x: fix_spelling(x, dhs_df['s_nrhm']))
asha_df['sdist'] = asha_df['sdist'].apply(
    lambda x: fix_spelling(x, dhs_df['sdist']))

merged_df = asha_df.merge(dhs_df, 'inner', on=['s_nrhm', 'sdist'])
merged_df.to_stata('E:/JSY MIS data/ASHA/asha_dhs_merge_state_dist.dta')
print(merged_df.head(25))
print(len(merged_df))
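# Minimal illustration of the fuzzy matching used above (difflib is in the
# standard library; the names here are made up):
import difflib
print(difflib.get_close_matches('Maharastra', ['Maharashtra', 'Madhya Pradesh']))
# -> ['Maharashtra']; when no candidate is close enough the list is empty,
#    which is why fix_spelling falls back to "NA"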
def prepare_data_figure12(data_cwhsa, data_cwhsb):
    """
    Take CWHS data set for FICA earnings and prepare it such that it can be
    used for plotting Figure 1 and 2.

    Parameters
    ----------
    data_cwhsa : pd.DataFrame
        CWHS data until 1978.
    data_cwhsb : pd.DataFrame
        CWHS data from 1978 on.

    Returns
    -------
    data_temp : pd.DataFrame
        Contains the FICA earnings in 1978 dollar terms across those eligible
        for the draft lottery for both whites and nonwhites for different
        years and birth cohorts.
    difference : pd.DataFrame
        Contains the difference in FICA earnings in 1978 dollar terms between
        eligible and ineligible for different groups (by ethnicity, year and
        birth cohort).

    """
    # read data for years 64 to 77
    data = data_cwhsa
    # declare that it is FICA earnings
    data["type"] = "TAXAB"

    # read data for the years after 77
    temp_data = data_cwhsb
    data = data.append(temp_data)
    data = data.loc[(data["year"] > 65) & (data["byr"] >= 50)]

    # create eligibility dummy
    data["eligible"] = 0
    data.loc[
        ((data["byr"] >= 44) & (data["byr"] <= 50) & (data["interval"] <= 39))
        | ((data["byr"] == 51) & (data["interval"] <= 25))
        | (((data["byr"] == 52) | (data["byr"] == 53)) & (data["interval"] <= 19)),
        "eligible",
    ] = 1

    # add the cpi to the data
    data_cpi = pd.read_stata("data/cpi_angrist1990.dta")
    data = pd.merge(data, data_cpi, on="year")

    # keep only FICA earnings
    data = data.loc[data["type"] == "TAXAB"]
    # create the average earnings for those with nonzero earnings
    data["earnings"] = data["vmn1"] / (1 - data["vfin1"])
    # create the sample size for those with nonzero earnings
    data["weights"] = data["vnu1"] * (1 - data["vfin1"])
    # create real earnings in 1978 terms
    data["cpi"] = (data["cpi"] / data.loc[data["year"] == 78, "cpi"].mean()).round(3)
    data["real_earnings"] = data["earnings"] / data["cpi"]

    # adjust earnings like in description below Figure 1 in the paper
    for cohort, addition in [(50, 3000), (51, 2000), (52, 1000)]:
        data.loc[data["byr"] == cohort, "real_earnings"] = (
            data.loc[data["byr"] == cohort, "real_earnings"] + addition)

    table = data.fillna(0)
    # drop groups where the weight sums to zero (i.e. where there are no positive earnings)
    sum_group_weights = (
        table.groupby(["race", "byr", "year", "eligible"])["weights"].sum().to_frame())
    nonzero_weights_index = sum_group_weights.loc[sum_group_weights["weights"] != 0].index
    table.set_index(["race", "byr", "year", "eligible"], inplace=True)
    table = table.loc[nonzero_weights_index]

    # get weighted averages within by groups of ethnicity, cohort, year and draft eligibility
    data_temp = table.groupby(["race", "byr", "year", "eligible"]).apply(
        lambda x: np.average(x[["real_earnings"]], weights=x["weights"], axis=0))
    data_temp = pd.DataFrame(
        data_temp.to_list(), columns=["real_earnings"], index=data_temp.index)

    # create dataframe with the differences in real earnings
    # for the above groups across eligibility
    difference = pd.DataFrame(index=data_temp.index, columns=["difference"])
    difference = difference.loc[(slice(None), slice(None), slice(None), 0), :]
    difference.reset_index("eligible", drop=True, inplace=True)
    difference["difference"] = (
        data_temp.loc[(slice(None), slice(None), slice(None), 1), :].values
        - data_temp.loc[(slice(None), slice(None), slice(None), 0), :].values)
    difference.reset_index("year", inplace=True)
    data_temp.reset_index("year", inplace=True)

    return data_temp, difference
    s_last = str(last) if last >= 10 else '0{}'.format(last)
    if first != -1 and first != last and last > round(first + delta, 3):
        p.append('{}{}-{}{}'.format(letter, s_first, letter, s_last))
    elif first != -1 and first != last and last == round(first + delta, 3):
        p.append(letter + s_first)
        p.append(letter + s_last)
    elif first != -1:
        p.append(letter + s_first)
    return p


## #########################
## Run Program
## #########################

## Determine uid_list. Set uid_list to the uid argument if it is an integer
oc = pd.read_stata(data_folder + '/unique_ids_and_codes.dta', encoding='latin-1')
uid_list = [int(uid)] if '{}'.format(uid).isdigit() else set(
    oc['uniqid'].values.astype(int))

# Import data
input_dataset = pd.read_stata(data_folder + '/recalculation_input.dta',
                              encoding='latin-1')

## For each uniqid found in code_breakdowns.dta...
for uid in uid_list:
    print('\n uid: {}'.format(uid))
    ## add data for the given uid
    dt = input_dataset[input_dataset['uniqid'] == uid]
    colnames = [col for col in dt.columns if 'metric' in col]
    colnames.append('cause')
# In[3]:

import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from linearmodels.iv import IV2SLS


# In[4]:

df1 = pd.read_stata('https://github.com/QuantEcon/lecture-python/blob/master/source/_static/lecture_specific/ols/maketable1.dta?raw=true')
df1.head()


# In[5]:

plt.style.use('seaborn')

df1.plot(x='avexpr', y='logpgp95', kind='scatter')
plt.show()


# In[6]:
print("Null Values:") print(dataf.apply(lambda x: sum(x.isnull()) / len(dataf))) ############################################################################## input_path = "/Users/christianhilscher/Desktop/dynsim/input/" model_path = "/Users/christianhilscher/desktop/dynsim/src/estimation/models/04/" output_path = "/Users/christianhilscher/Desktop/dynsim/output/" estimation_path = "/Users/christianhilscher/desktop/dynsim/src/estimation/" sim_path = "/Users/christianhilscher/desktop/dynsim/src/sim/" os.chdir("/Users/christianhilscher/desktop/dynsim/src/data_preparation/") from cleaning import SOEP_to_df from data_prep import SOEP_to_df_old # Making new dataset from original SOEP df_pgen = pd.read_stata("/Volumes/B/soep.v35/STATA_DEEN_v35/Stata/pgen.dta") df_hgen = pd.read_stata("/Volumes/B/soep.v35/STATA_DEEN_v35/Stata/hgen.dta") df_ppathl = pd.read_stata("/Volumes/B/soep.v35/STATA_DEEN_v35/Stata/ppathl.dta") df_hpathl = pd.read_stata("/Volumes/B/soep.v35/STATA_DEEN_v35/Stata/hpathl.dta") df_hbrutto = pd.read_stata("/Volumes/B/soep.v35/STATA_DEEN_v35/Stata/hbrutto.dta") df_pgen = df_pgen[["hid", "pid" , "syear", "pglabgro", "pgemplst", "pglfs", "pgtatzeit", "pgerwzeit", "pgpsbil", "pgfamstd"]] df_hgen = df_hgen[["hid", "syear", "hgheat", "hgrent", "hgtyp1hh"]] df_ppathl = df_ppathl[["hid", "pid", "syear", "sex", "gebjahr", "migback", "phrf"]] df_hpathl = df_hpathl[["hid", "syear", "hhrf"]] df_hbrutto = df_hbrutto[["hid", "syear", "bula"]] # Merging datasets from SOEP person_df = pd.merge(df_pgen, df_ppathl, on=["pid", "syear"], how="left")
dq_utmCA = r"{}/output/dq_utmCA.dta".format(buildPath) # Property selection should only have one variable: sr_property_id dq_selection = "" # Temp files temp_select = "temp_select.shp" unified = "temp_unified.shp" dist_temp = "temp_top100_lines.csv" # California Transverse Mercator Projection # (http://spatialreference.org/ref/sr-org/7098/) # Also known as UTM Zone 10.5 ctm = ctm_proj() # Now merge reprojected house locations with selection of houses dfproj = pd.read_stata(dq_utmCA) if dq_selection != "": dfselect = pd.read_stata(dq_selection) df = pd.merge(left=dfproj,right=dfselect, how='right', left_on='sr_property_id', right_on='sr_property_id') else: df = dfproj print('df read') # Reproject top 100 reproject(top100_lines_raw,lines_utmCA,ctm) # Caluclate distances with fiona.open(lines_utmCA) as source: dataout_lines = dq_to_feats(df,source)
# In[2]:

import pandas as pd


# ## Load the data.
#
# ---
#
# If this fails, be sure that you've saved your own data in the prescribed location, then retry.

# In[3]:

file = "../data/interim/1976data.dta"  # Matt Wilson converted the older Stata file to the one we use.
df_rawest = pd.read_stata(file)


# In[4]:

good_columns = [
    # Demographic
    'V763174',  # SUMMARY-R'S PARTY ID
    'V763796',  # OPIN:WHEN ALLOW ABORTION (1: never)
    'V763273',  # Private vs public insurance
    'V763241',  # GOVT GUAR JOB/S.L (1: guarantee)
    'V763353',  # Gov't should spend less, even if cutting health and education.
    'V763264',  # MNRTY GRP AID SCL (1: help)
    # 'V763757',  # THE POOR ARE POOR BECAUSE THE AMERICAN WAY OF LIFE DOESN'T GIVE ALL PEOPLE AN EQUAL CHANCE? (1: agree)
]
df_raw = df_rawest[good_columns]
def lines_cases_per_million_not_log():
    # chart_width = 550
    chart_height = 400

    # hide
    data = pd.read_csv("static/csv/time_series_covid19_confirmed_global.csv",
                       error_bad_lines=False)
    data = data.drop(columns=["Lat", "Long"])
    data = data.melt(id_vars=["Province/State", "Country/Region"])
    data = pd.DataFrame(data.groupby(['Country/Region', "variable"]).sum())
    data.reset_index(inplace=True)
    data = data.rename(columns={"Country/Region": "location",
                                "variable": "date",
                                "value": "total_cases"})
    data['date'] = pd.to_datetime(data.date)
    data = data.sort_values(by="date")
    data.loc[data.location == "US", "location"] = "United States"
    data.loc[data.location == "Korea, South", "location"] = "South Korea"

    data_pwt = pd.read_stata("https://www.rug.nl/ggdc/docs/pwt91.dta")

    filter1 = data_pwt["year"] == 2017
    data_pop = data_pwt[filter1]
    data_pop = data_pop[["country", "pop"]]
    data_pop.loc[data_pop.country == "Republic of Korea", "country"] = "South Korea"
    data_pop.loc[data_pop.country == "Iran (Islamic Republic of)", "country"] = "Iran"

    # per habitant
    data_pc = data.copy()
    countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany",
                 "Portugal", "United States", "Singapore", "South Korea", "Japan",
                 "Brazil", "Iran", 'Netherlands', 'Belgium', 'Sweden', 'Switzerland',
                 'Norway', 'Denmark', 'Austria', 'Slovenia', 'Greece', 'Cyprus']
    data_countries = []
    data_countries_pc = []

    MIN_DEATHS = 10
    filter_min_dead = data_pc.total_cases < MIN_DEATHS
    data_pc = data_pc.drop(data_pc[filter_min_dead].index)

    # compute per habitant
    for i in countries:
        data_pc.loc[data_pc.location == i, "total_cases"] = (
            data_pc.loc[data_pc.location == i, "total_cases"]
            / float(data_pop.loc[data_pop.country == i, "pop"]))

    # get each country time series
    filter1 = data_pc["total_cases"] > 1
    for i in countries:
        filter_country = data_pc["location"] == i
        data_countries_pc.append(data_pc[filter_country & filter1])

    # hide_input
    # Stack data into one long dataframe for plotting
    data_countries_pc2 = data_countries_pc.copy()
    for i in range(0, len(countries)):
        data_countries_pc2[i] = data_countries_pc2[i].reset_index()
        data_countries_pc2[i]['n_days'] = data_countries_pc2[i].index
        data_countries_pc2[i]['log_cases'] = np.log10(data_countries_pc2[i]["total_cases"])
    data_plot = data_countries_pc2[0]
    for i in range(1, len(countries)):
        data_plot = pd.concat([data_plot, data_countries_pc2[i]], axis=0)
    data_plot["trend_4days"] = np.log10(2) / 4 * data_plot["n_days"]
    data_plot["trend_12days"] = np.log10(2) / 12 * data_plot["n_days"]
    data_plot["trend_4days_label"] = "Doubles every 4 days"
    data_plot["trend_12days_label"] = "Doubles every 12 days"

    # Plot it using Plotly Express
    source = data_plot
    fig = px.line(data_plot, x="n_days", y="total_cases", color='location')
    fig.update_layout(xaxis_title='Day Since first infection',
                      yaxis_title='#cases/million')
    return fig
def ScrapeIslands(self, backUpFile='backUpFile.csv'):
    inputFile = self.infile
    if inputFile[-3:] == 'csv':
        remainingPids = pd.read_csv(inputFile).filter(['pid'])
    elif inputFile[-3:] == 'dta':
        remainingPids = pd.read_stata(inputFile).filter(['pid'])
    elif inputFile[-3:] == 'lsx':
        remainingPids = pd.read_excel(inputFile).filter(['pid'])
    t0 = time.time()
    islandID = 0
    total = remainingPids.count()[0]
    finalcsv = pd.DataFrame()
    finished = False
    print('Starting scrape')
    while not finished:
        pid = remainingPids.iloc[0, remainingPids.columns.get_loc('pid')]
        # pid is inserted 3 times because ScrapePerson often breaks if only fed one pid
        temp = pd.DataFrame(data={'pid': [pid, pid, pid]})
        size = temp.drop_duplicates().count()[0]
        completeIsland = temp.drop_duplicates()
        growing = True
        try:
            while growing:
                temp.to_csv(r'in.csv')
                finaldf = pd.DataFrame()
                with suppress_stdout():
                    try:
                        self.ScrapePerson()
                    except:
                        print("Invalid pid " + pid)
                outFile = pd.read_csv('out.csv', usecols=['FSID', 'spouse_FSID',
                                                          'kid_FSIDs', 'dad_FSIDs',
                                                          'mom_FSIDs'])
                # only used on the first pass, gets rid of extra pids needed to run ScrapePerson
                if size == 1:
                    outFile = outFile.drop_duplicates()
                outFile = outFile.rename(columns={'FSID': 'pid',
                                                  'spouse_FSID': 'spouse',
                                                  'kid_FSIDs': 'kids',
                                                  'dad_FSIDs': 'dad',
                                                  'mom_FSIDs': 'mom'})
                # do by column, rename pid, merge and drop duplicates
                counts = outFile.count()
                if counts.get('spouse') > 0:
                    pidSpouse = (outFile.set_index(outFile.columns.drop('spouse', 1).tolist())
                                 .spouse.str.split(';', expand=True)
                                 .stack()
                                 .reset_index()
                                 .rename(columns={0: 'spouse'})
                                 .loc[:, outFile.columns])
                    pidSpouse = pidSpouse.filter(['spouse']).rename(
                        columns={'spouse': 'pid'}).drop_duplicates()
                    finaldf = finaldf.append(pidSpouse, ignore_index=True)
                if counts.get('kids') > 0:
                    pidKids = (outFile.set_index(outFile.columns.drop('kids', 1).tolist())
                               .kids.str.split(';', expand=True)
                               .stack()
                               .reset_index()
                               .rename(columns={0: 'kids'})
                               .loc[:, outFile.columns])
                    pidKids = pidKids.filter(['kids']).rename(
                        columns={'kids': 'pid'}).drop_duplicates()
                    finaldf = finaldf.append(pidKids, ignore_index=True)
                if counts.get('dad') > 0:
                    pidDad = (outFile.set_index(outFile.columns.drop('dad', 1).tolist())
                              .dad.str.split(';', expand=True)
                              .stack()
                              .reset_index()
                              .rename(columns={0: 'dad'})
                              .loc[:, outFile.columns])
                    pidDad = pidDad.filter(['dad']).rename(
                        columns={'dad': 'pid'}).drop_duplicates()
                    finaldf = finaldf.append(pidDad, ignore_index=True)
                if counts.get('mom') > 0:
                    pidMom = (outFile.set_index(outFile.columns.drop('mom', 1).tolist())
                              .mom.str.split(';', expand=True)
                              .stack()
                              .reset_index()
                              .rename(columns={0: 'mom'})
                              .loc[:, outFile.columns])
                    pidMom = pidMom.filter(['mom']).rename(
                        columns={'mom': 'pid'}).drop_duplicates()
                    finaldf = finaldf.append(pidMom, ignore_index=True)
                # drops any duplicate pids found, will happen more on later generations
                finaldf = finaldf.drop_duplicates()
                # ensures that ScrapePerson always has more than one pid to run on
                try:
                    if finaldf.count()[0] > 0 and finaldf.count()[0] < 3:
                        finaldf = finaldf.append(finaldf, ignore_index=True)
                        # print(finaldf)
                except:
                    print("Single household")
                # appends pids found to the list
                completeIsland = completeIsland.append(
                    finaldf, ignore_index=True).drop_duplicates()
                totalSize = completeIsland.count()[0]
                if size == totalSize or totalSize > 1500:
                    growing = False
                    completeIsland['islandID'] = islandID
                    completeIsland['Count'] = totalSize
                    print("Number of people found: " + str(totalSize))
                else:
                    size = totalSize
                    temp = finaldf
                    if size > 9:
                        print("Found " + str(size) + " in the island so far")
        # something broke so it saves the pids remaining and exports what it has gathered so far
        except:
            try:
                print(sys.exc_info()[0])
                sys.exc_info()[2].traceback.print_exception(limit=5)
            except:
                pass
            remainingPids.to_csv(backUpFile, index=False)
            break
        finalcsv = finalcsv.append(completeIsland, ignore_index=True)
        remainingPids = remainingPids[~remainingPids['pid'].isin(completeIsland['pid'])]
        leftover = remainingPids.count()[0]
        print(str(leftover) + ' people left of ' + str(total))
        islandID += 1
        if islandID % 25 == 0:
            print()
            t2 = time.time()
            avg_time = (t2 - t0) / islandID
            print('Avg Time: ' + str(round(avg_time, 3)) + ' Seconds/Island')
            print('Est. time remaining: ' + str(round(avg_time * leftover / 3 / 60, 3)) + ' minutes')
            print()
        if leftover == 0:
            finished = True
            print("Finished!")
    return finalcsv
import os
import pandas as pd
import python.datafiles as datafiles

data = pd.read_csv("data/workdata_2016.dta")
data = pd.read_csv("data/workdata_2016.dta", encoding="iso-8859-1", error_bad_lines=False)
data = pd.read_csv("data/workdata_2016.dta", encoding="utf-8")
# data = pd.read_csv('file1.csv',
data = pd.read_stata("data/workdata_2016.dta", encoding="ascii")
data = pd.read_stata("data/workdata_2016.dta", encoding="latin-1")

# data_recip_1.to_csv( folder + "recip-1/" + name + '.csv')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 18 14:12:49 2017

@author: pradeep
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

cluster_dt = pd.read_stata(
    '/Users/pradeep/Google Drive ([email protected])/Cluster Paper/Data/Data_MAin_v12.dta'
)
cluster_dt.to_pickle('cluster_dt.pkl')
cluster_dt = pd.read_pickle('cluster_dt.pkl')

# pprint(cluster_dt.columns)
# print(cluster_dt.columns)
# print(pd.get_option("display.max_columns"))
# print(pd.set_option("display.max_columns", 999))
# pd.set_option('display.max_columns', None)
# cluster_dt.columns
# pandas.set_option('display.max_rows', None)

cols = list(cluster_dt.columns)
df1 = cluster_dt[[
    'Company_Name', 'City', 'PreviousLocation_F1', 'PreviousLocation_F2',
    'PreviousLocation_F3', 'PreviousLocation_F4',
# IPPP Final Project - Regression Analysis
### Long-term Health Impact ###
# outcome vars are: sick (dummy for person sick), days sick,

import pandas as pd
import numpy as np
import statsmodels.api as sm

# read investment data
investments = pd.DataFrame(
    pd.read_stata("investments_data.dta", convert_categoricals=False))

# mask wave 7 to get 2003 data and drop obvs with no consumption data
investments = investments.loc[(investments['wave'] == 7)
                              & (investments['consumo_pp_ae'] != 0)]
investments.dropna(subset=['consumo_pp_ae'], inplace=True)

# mask out 1 (poorest hh) ntile and 200 (richest hh) ntile of consumption
iinvestments = investments.loc[(investments['consumo_pp_ae2'] > 22.2066)
                               & (investments['consumo_pp_ae2'] < 1118.53)]

# read morbidity data
morbidity = pd.DataFrame(
    pd.read_stata("adults_morbidity_03.dta", convert_categoricals=False))

# create age squared
morbidity['age_sq'] = morbidity['age']**2

# sorting the rows for joining
morbidity = morbidity.sort_values(by=['folio', 'state', 'muni', 'local', 'wave'],
                                  ascending=[True, True, True, True, True])
def gen_var(agegroup_type_Bailey=True):
    """Function to create the relevant categoricals from scratch.

    Args:
    -------

    Returns:
    -------
    A data frame containing the variables needed for the analysis.

    """
    wn.filterwarnings("ignore")

    # 1. 1955 GAF
    # Read dta file 1955 GAF
    df_55 = pd.read_stata('data/gaf55_1.dta', convert_categoricals=False)

    # generate dummy: 1 if fertile and 0 if not
    df_55['_fecund'] = 0
    df_55.loc[(df_55['fec_code'] == 4) | (df_55['fec_code'] == 5), '_fecund'] = 1

    # generate dummy if ever used method of contraception
    df_55['_everuse'] = 0
    df_55.loc[(df_55['meth_everuse'] == 1), '_everuse'] = 1

    # generate dummy if ever used method of contraception or as douche
    df_55['_everuse_d'] = 0
    df_55.loc[(df_55['meth_everuse'] == 1) | (df_55['meth_everuse'] == 2), '_everuse_d'] = 1

    # generate dummy if ever used condom or diaphragma
    df_55['_barrier'] = 0
    df_55.loc[(df_55['con_use'] == 1) | (df_55['dia_use'] == 1), '_barrier'] = 1

    # generate dummy whether either condom, diaphragma, jelly, vagsupp, foamtab,
    # tampon or any other method was ever used
    df_55['_supplies'] = 0
    df_55.loc[(df_55['con_use'] == 1) | (df_55['dia_use'] == 1) | (df_55['jelly_use'] == 1)
              | (df_55['vagsupp_use'] == 1) | (df_55['foamtab_use'] == 1)
              | (df_55['tampon_use'] == 1) | (df_55['othmeth_use'] == 1), '_supplies'] = 1

    # generate dummy, if ever used oral contraception (not possible in 1955)
    df_55['_oral'] = 0
    # generate dummy wgt
    df_55['wgt'] = 1
    # generate year
    df_55['year'] = 1955

    # 2. 1965 NFS
    # read .dta file 1965 NFS
    df_65 = pd.read_stata('data/nfs65_1.dta', convert_categoricals=False)

    # generate dummy for fertility, one means fertile
    df_65['_fecund'] = 0
    df_65.loc[(df_65['fecund'] == 8), '_fecund'] = 1

    # generate dummy if ever used method of contraception or as douche
    df_65['_everuse_d'] = 0
    df_65.loc[(df_65['preg1_meth1_4'] == 1) | (df_65['preg1_meth1_5'] == 1)
              | (df_65['preg1_meth1_6'] == 1) | (df_65['preg1_meth1_7'] == 1)
              | (df_65['preg1_meth1_8'] == 1) | (df_65['preg1_meth1_12'] == 1)
              | (df_65['preg1_meth1_9'] == 1) | (df_65['preg1_meth1_10'] == 1)
              | (df_65['preg1_meth1_11'] == 1) | (df_65['preg1_meth1_13'] == 1)
              | (df_65['preg1_meth1_14'] == 1) | (df_65['preg1_meth1_15'] == 1)
              | (df_65['preg1_meth1_16'] == 1), '_everuse_d'] = 1

    # generate dummy whether a contraception method was used before the first pregnancy
    df_65['_everuse2_mjb'] = 0
    df_65.loc[df_65['preg1_meth1_3'] == 0, '_everuse2_mjb'] = 1  # 1 if not used before first pregnancy
    df_65.loc[df_65['preg1_meth1_2'] == 1, '_everuse2_mjb'] = np.NaN  # no answer to question of contraception used before first pregnancy

    # generate dummy if ever used method of contraception (exclusive douche)
    df_65['_everuse3_mjb'] = df_65['_everuse_d']
    df_65.loc[(df_65['preg1_meth1_13'] == 1) & (df_65['preg1_meth1_1'] == 0) & (df_65['preg1_meth1_2'] == 0)
              & (df_65['preg1_meth1_3'] == 0) & (df_65['preg1_meth1_4'] == 0) & (df_65['preg1_meth1_5'] == 0)
              & (df_65['preg1_meth1_6'] == 0) & (df_65['preg1_meth1_7'] == 0) & (df_65['preg1_meth1_8'] == 0)
              & (df_65['preg1_meth1_9'] == 0) & (df_65['preg1_meth1_10'] == 0) & (df_65['preg1_meth1_11'] == 0)
              & (df_65['preg1_meth1_12'] == 0) & (df_65['preg1_meth1_14'] == 0) & (df_65['preg1_meth1_15'] == 0)
              & (df_65['preg1_meth1_16'] == 0), '_everuse3_mjb'] = 0

    # generate dummy if a barrier method (condom, diaphragma) was ever used
    df_65['_barrier'] = 0
    df_65.loc[(df_65['preg1_meth1_5'] == 1) | (df_65['preg1_meth1_7'] == 1), '_barrier'] = 1

    # generate dummy if any supply methods were ever used (condoms, diaphragma,
    # pill, jelly, foam, douche, IUD, sponge)
    df_65['_supplies'] = 0
    df_65.loc[(df_65['preg1_meth1_5'] == 1) | (df_65['preg1_meth1_7'] == 1) | (df_65['preg1_meth1_8'] == 1)
              | (df_65['preg1_meth1_9'] == 1) | (df_65['preg1_meth1_10'] == 1) | (df_65['preg1_meth1_11'] == 1)
              | (df_65['preg1_meth1_13'] == 1) | (df_65['preg1_meth1_14'] == 1) | (df_65['preg1_meth1_16'] == 1),
              '_supplies'] = 1

    # generate dummy if oral contraception was used (pill)
    df_65['_oral'] = 0
    df_65.loc[df_65['preg1_meth1_8'] == 1, '_oral'] = 1

    # repeat this for any different A-types of pregi_meth1_j
    for i in range(0, 21):
        df_65.loc[(df_65['preg{}_meth1_4'.format(i)] == 1) | (df_65['preg{}_meth1_5'.format(i)] == 1)
                  | (df_65['preg{}_meth1_6'.format(i)] == 1) | (df_65['preg{}_meth1_7'.format(i)] == 1)
                  | (df_65['preg{}_meth1_8'.format(i)] == 1) | (df_65['preg{}_meth1_12'.format(i)] == 1)
                  | (df_65['preg{}_meth1_9'.format(i)] == 1) | (df_65['preg{}_meth1_10'.format(i)] == 1)
                  | (df_65['preg{}_meth1_11'.format(i)] == 1) | (df_65['preg{}_meth1_13'.format(i)] == 1)
                  | (df_65['preg{}_meth1_14'.format(i)] == 1) | (df_65['preg{}_meth1_15'.format(i)] == 1)
                  | (df_65['preg{}_meth1_16'.format(i)] == 1), '_everuse_d'] = 1
        df_65.loc[df_65['preg{}_meth1_3'.format(i)] == 0, '_everuse2_mjb'] = 1
        df_65['_everuse3_mjb'] = df_65['_everuse_d']
        df_65.loc[(df_65['preg{}_meth1_13'.format(i)] == 1) & (df_65['preg{}_meth1_1'.format(i)] == 0)
                  & (df_65['preg{}_meth1_2'.format(i)] == 0) & (df_65['preg{}_meth1_3'.format(i)] == 0)
                  & (df_65['preg{}_meth1_4'.format(i)] == 0) & (df_65['preg{}_meth1_5'.format(i)] == 0)
                  & (df_65['preg{}_meth1_6'.format(i)] == 0) & (df_65['preg{}_meth1_7'.format(i)] == 0)
                  & (df_65['preg{}_meth1_8'.format(i)] == 0) & (df_65['preg{}_meth1_9'.format(i)] == 0)
                  & (df_65['preg{}_meth1_10'.format(i)] == 0) & (df_65['preg{}_meth1_11'.format(i)] == 0)
                  & (df_65['preg{}_meth1_12'.format(i)] == 0) & (df_65['preg{}_meth1_14'.format(i)] == 0)
                  & (df_65['preg{}_meth1_15'.format(i)] == 0) & (df_65['preg{}_meth1_16'.format(i)] == 0),
                  '_everuse3_mjb'] = 0
        df_65.loc[(df_65['preg{}_meth1_5'.format(i)] == 1) | (df_65['preg{}_meth1_7'.format(i)] == 1),
                  '_barrier'] = 1
        df_65.loc[(df_65['preg{}_meth1_5'.format(i)] == 1) | (df_65['preg{}_meth1_7'.format(i)] == 1)
                  | (df_65['preg{}_meth1_8'.format(i)] == 1) | (df_65['preg{}_meth1_9'.format(i)] == 1)
                  | (df_65['preg{}_meth1_10'.format(i)] == 1) | (df_65['preg{}_meth1_11'.format(i)] == 1)
                  | (df_65['preg{}_meth1_13'.format(i)] == 1) | (df_65['preg{}_meth1_14'.format(i)] == 1)
                  | (df_65['preg{}_meth1_16'.format(i)] == 1), '_supplies'] = 1
        df_65.loc[df_65['preg{}_meth1_8'.format(i)] == 1, '_oral'] = 1
        # print(i)

    df_65 = df_65[['_barrier', '_supplies', 'preg0_coitfr', '_oral', '_fecund',
                   '_everuse_d', '_everuse2_mjb', '_everuse3_mjb', 'H_emp_inc65',
                   'fam_idealch', 'att_idealch', 'res_popdens', 'res_reg',
                   'statefip', 'statename', 'int_num', 'per_race', '_age',
                   'ed_higrade', 'rel_pref', 'dob', 'fec_sterilop1', 'wgt',
                   'ch_totlb']]
    df_65['year'] = 1965

    # 3. 1970 NFS
    # read .dta file 1970 NFS
    df_70 = pd.read_stata('data/nfs70_1.dta', convert_categoricals=False)

    # generate dummy if ever used method of contraception or as douche
    df_70['_everuse_d'] = 0
    df_70.loc[(df_70['preg1_meth_any'] == 1) | (df_70['dia_everuse'] == 1)
              | (df_70['iud_everuse'] == 1) | (df_70['_bcp_everuse'] == 1), '_everuse_d'] = 1

    # generate dummy if a barrier method (condom, diaphragma) was ever used
    df_70['_barrier'] = 0
    df_70.loc[(df_70['preg1_meth_1'] == 3) | (df_70['preg1_meth_1'] == 5)
              | (df_70['dia_everuse'] == 1), '_barrier'] = 1

    # generate dummy if any supply methods were ever used (condoms, diaphragma,
    # pill, jelly, foam, douche, IUD, sponge)
    df_70['_supplies'] = 0
    df_70.loc[(df_70['preg1_meth_1'] == 3) | (df_70['preg1_meth_1'] == 5) | (df_70['preg1_meth_1'] == 6)
              | (df_70['preg1_meth_1'] == 7) | (df_70['preg1_meth_1'] == 8) | (df_70['preg1_meth_1'] == 9)
              | (df_70['preg1_meth_1'] == 10) | (df_70['preg1_meth_1'] == 11) | (df_70['preg1_meth_1'] == 13)
              | (df_70['dia_everuse'] == 1) | (df_70['iud_everuse'] == 1) | (df_70['_bcp_everuse'] == 1),
              '_supplies'] = 1

    # generate dummy if oral contraception was used (pill)
    df_70['_oral'] = 0
    df_70.loc[(df_70['preg1_meth_1'] == 6) | (df_70['_bcp_everuse'] == 1), '_oral'] = 1

    # rename variable to make it suitable for loop
    df_70 = df_70.rename(columns={'preg0_meth': 'preg0_meth_1'})
    # df_70['preg0_meth_1'] = df_70['preg0_meth']

    for i in range(0, 20):
        if i == 0:
            j = 1
            df_70.loc[df_70['preg0_meth1'] == 1, '_everuse_d'] = 1
            df_70.loc[(df_70['preg0_meth_{}'.format(j)] == 3)
                      | (df_70['preg0_meth_{}'.format(j)] == 5), '_barrier'] = 1
            df_70.loc[(df_70['preg0_meth_{}'.format(j)] == 3) | (df_70['preg0_meth_{}'.format(j)] == 5)
                      | (df_70['preg0_meth_{}'.format(j)] == 6) | (df_70['preg0_meth_{}'.format(j)] == 7)
                      | (df_70['preg0_meth_{}'.format(j)] == 8) | (df_70['preg0_meth_{}'.format(j)] == 9)
                      | (df_70['preg0_meth_{}'.format(j)] == 10) | (df_70['preg0_meth_{}'.format(j)] == 11)
                      | (df_70['preg0_meth_{}'.format(j)] == 13), '_supplies'] = 1
            df_70.loc[(df_70['preg0_meth_{}'.format(j)] == 6), '_oral'] = 1
        else:
            df_70.loc[(df_70['preg{}_meth_any'.format(i)] == 1), '_everuse_d'] = 1
            for j in range(1, 7):
                df_70.loc[(df_70['preg{}_meth_{}'.format(i, j)] == 3)
                          | (df_70['preg{}_meth_{}'.format(i, j)] == 5), '_barrier'] = 1
                df_70.loc[(df_70['preg{}_meth_{}'.format(i, j)] == 3) | (df_70['preg{}_meth_{}'.format(i, j)] == 5)
                          | (df_70['preg{}_meth_{}'.format(i, j)] == 6) | (df_70['preg{}_meth_{}'.format(i, j)] == 7)
                          | (df_70['preg{}_meth_{}'.format(i, j)] == 8) | (df_70['preg{}_meth_{}'.format(i, j)] == 9)
                          | (df_70['preg{}_meth_{}'.format(i, j)] == 10) | (df_70['preg{}_meth_{}'.format(i, j)] == 11)
                          | (df_70['preg{}_meth_{}'.format(i, j)] == 13), '_supplies'] = 1
                df_70.loc[df_70['preg{}_meth_{}'.format(i, j)] == 6, '_oral'] = 1
        # print(i)

    df_70 = df_70[['_barrier', '_supplies', '_oral', 'preg0_coitfr', '_everuse_d',
                   'H_emp_inc70', 'fam_idealch', 'att_idealch', 'statefip',
                   'statename', 'int_LAN', 'int_LAN_size', 'int_LAN_region',
                   'int_num', 'per_race', 'wgt', '_age', 'mar_stat', 'ed_total',
                   'rel_pref', 'dob', 'fec_sterilop1', 'ch_totlb']]
    df_70['year'] = 1970

    # append data frames
    df = df_55
    df = df.append(df_65, ignore_index=True)
    df = df.append(df_70, ignore_index=True)

    # 4. Harmonize definitions of sample variables

    # Race
    df['_White'] = 0
    df.loc[df['year'] == 1955, '_White'] = 1
    df.loc[(df['year'] == 1965) & (df['per_race'] == 1), '_White'] = 1
    df.loc[(df['year'] == 1970) & (df['per_race'] == 2), '_White'] = 1

    # Marital Status
    df['_Married'] = 0
    df.loc[(df['year'] == 1955) | (df['year'] == 1965), '_Married'] = 1
    df.loc[(df['year'] == 1970) & (df['mar_stat'] == 1), '_Married'] = 1

    # Education
    df['_ed_cat'] = np.NaN
    lis = [0, 9, 12, 13, 16, 18]
    df.loc[(df['year'] == 1965) & (df['ed_higrade'] == 0), '_ed_cat'] = 0
    for i in range(1, len(lis)):
        # print(i)
        df.loc[(df['year'] == 1965) & (lis[i - 1] <= df['ed_higrade'])
               & (df['ed_higrade'] < lis[i]), '_ed_cat'] = lis[i - 1]
        df.loc[(df['ed_total'] == i), 'ed_total'] = lis[i - 1]
    df.loc[(df['year'] == 1970), '_ed_cat'] = df['ed_total']
    df.loc[(df['ed'] <= 4) & (df['ed'] >= 1), 'ed'] = 0
    df.loc[(df['ed'] == 8) | (df['ed'] == 9), 'ed'] = 16
    df.loc[(df['ed'] == 5), 'ed'] = 9
    df.loc[(df['ed'] == 6), 'ed'] = 12
    df.loc[(df['ed'] == 7), 'ed'] = 13
    df.loc[(df['year'] == 1955), '_ed_cat'] = df['ed']

    # Catholic
    df['_Catholic'] = 0
    df.loc[((df['year'] == 1955) & (df['rel_pref'] == 10))
           | ((df['year'] == 1965) & (df['rel_pref'] == 21))
           | ((df['year'] == 1970) & (df['rel_pref'] == 21)), '_Catholic'] = 1

    # Birth Cohort
    df['_yob'] = np.NaN
    lis = range(0, 870, 12)
    for i in lis:
        df.loc[((df['year'] == 1970) | (df['year'] == 1965))
               & (df['dob'] >= i) & (df['dob'] < i + 12), '_yob'] = i / 12 + 1900
    df.loc[((df['year'] == 1970) | (df['year'] == 1965)) & (df['dob'] > 900), '_yob'] = df['dob'] + 1000
    df.loc[(df['year'] == 1955), '_yob'] = df['dob_y'] + 1900

    df['_yobcat'] = np.NaN
    lis = range(1910, 1960, 5)
    for i in lis:
        df.loc[(df['_yob'] >= i) & (df['_yob'] < i + 5), '_yobcat'] = i

    # age
    df['_age'] = np.NaN
    df['_age'] = df['year'] - df['_yob']
    df['_agecat'] = np.NaN
    lis = range(15, 60, 5)
    for i in lis:
        df.loc[(df['_age'] >= i) & (df['_age'] < i + 5), '_agecat'] = i

    # Surgically sterilized
    df['_sterilop'] = 0
    df.loc[(df['year'] == 1955) & (df['fec_ster1'] != 0) & (df['fec_ster1'] != 6), '_sterilop'] = 1
    df.loc[(df['year'] == 1965) & (df['fec_sterilop1'] == 1), '_sterilop'] = 1
    df.loc[(df['year'] == 1970) & (df['fec_sterilop1'] == 1), '_sterilop'] = 1

    # ideal number of children for american family
    df['_att_idealch'] = np.NaN  # create variable to standardize variables.
    df.loc[(df['att_idealch'] < 99) & (df['year'] == 1955), '_att_idealch'] = df['att_idealch'] / 10
    df.loc[(df['att_idealch'] > 90) & (df['att_idealch'] <= 130) & (df['year'] == 1965),
           'att_idealch'] = 90  # fix coding bug
    df.loc[((df['att_idealch'] <= 90) & (df['year'] == 1965))
           | ((df['att_idealch'] < 96) & (df['year'] == 1970)), '_att_idealch'] = df['att_idealch'] / 10

    df['_idealcat'] = np.NaN
    lis = [0, 2, 3, 4, 5, 10, 1000]  # 1000 can be an arbitrary high value, just to make the loop suitable
    for i in range(0, (len(lis) - 1)):
        # print(i)
        df.loc[(lis[i] <= df['_att_idealch']) & (df['_att_idealch'] < lis[i + 1]), '_idealcat'] = lis[i]

    # Husbands income
    df['h_empinc65'] = np.NaN
    df.loc[(df['year'] == 1965), 'h_empinc65'] = df['H_emp_inc65']
    df.loc[(df['h_empinc65'] != 0) & (df['h_empinc65'] < 10), 'h_empinc65'] = df['h_empinc65'] * 1000 + 500
    df.loc[(df['h_empinc65'] == 10), 'h_empinc65'] = 11000
    df.loc[(df['h_empinc65'] == 20), 'h_empinc65'] = 13500
    df.loc[(df['h_empinc65'] == 30), 'h_empinc65'] = 21000
    df.loc[(df['h_empinc65'] == 88) | (df['h_empinc65'] == 99), 'h_empinc65'] = np.NaN

    df['h_empinc70'] = np.NaN
    df.loc[(df['year'] == 1970), 'h_empinc70'] = df['H_emp_inc70']
    df.loc[(df['h_empinc70'] != 0) & (df['h_empinc70'] < 10), 'h_empinc70'] = df['h_empinc70'] * 1000 + 500
    df.loc[(df['h_empinc70'] == 10), 'h_empinc70'] = 12000
    df.loc[(df['h_empinc70'] == 11), 'h_empinc70'] = 13500
    df.loc[(df['h_empinc70'] == 12), 'h_empinc70'] = 21000
    df.loc[(df['h_empinc70'] == 88) | (df['h_empinc70'] == 99), 'h_empinc70'] = np.NaN

    df['h_empinc55'] = np.NaN
    df.loc[(df['year'] == 1955), 'h_empinc55'] = df['h_emp_inc']
    df.loc[(df['h_empinc55'] != 0) & (df['h_empinc55'] < 9), 'h_empinc55'] = (df['h_empinc55'] - 1) * 1000 + 500
    df.loc[(df['h_empinc55'] == 9), 'h_empinc55'] = 9000
    df.loc[(df['h_empinc55'] == 10), 'h_empinc55'] = 14000
    df.loc[(df['h_empinc55'] == 88) | (df['h_empinc55'] == 99), 'h_empinc55'] = np.NaN

    df['_h_empinc'] = np.NaN
    df.loc[(df['year'] == 1955), '_h_empinc'] = (36.7 / 26.9) * df['h_empinc55']  # in 1969 US Dollar
    df.loc[(df['year'] == 1965), '_h_empinc'] = (36.7 / 31) * df['h_empinc65']  # in 1969 US Dollar
    df.loc[(df['year'] == 1970), '_h_empinc'] = df['h_empinc70']  # in 1969 US Dollar

    df['_hinccat'] = np.NaN
    df.loc[(0 <= df['_h_empinc']) & (df['_h_empinc'] <= 4500), '_hinccat'] = 0
    df.loc[(4500 < df['_h_empinc']) & (df['_h_empinc'] <= 6500), '_hinccat'] = 1
    df.loc[(6500 < df['_h_empinc']) & (df['_h_empinc'] <= 8400), '_hinccat'] = 2
    df.loc[(8400 < df['_h_empinc']) & (df['_h_empinc'] <= 12000), '_hinccat'] = 3
    df.loc[(12000 <= df['_h_empinc']) & (df['_h_empinc'] <= 25000), '_hinccat'] = 4

    # Attitudes on family planning
    df['_approvefp55'] = np.NaN
    df.loc[(df['year'] == 1955), '_approvefp55'] = 0
    df.loc[(df['year'] == 1955) & (df['att_famplan'] <= 2), '_approvefp55'] = 1

    # Region
    df.loc[((df['int_LAN_region'] == 1) | (df['int_LAN_region'] == 2)) & (df['year'] == 1970), 'res_reg'] = 1
    df.loc[((df['int_LAN_region'] == 3) | (df['int_LAN_region'] == 4)) & (df['year'] == 1970), 'res_reg'] = 2
    df.loc[((df['int_LAN_region'] == 5) | (df['int_LAN_region'] == 6)
            | (df['int_LAN_region'] == 7)) & (df['year'] == 1970), 'res_reg'] = 3
    df.loc[((df['int_LAN_region'] == 8) | (df['int_LAN_region'] == 9)) & (df['year'] == 1970), 'res_reg'] = 4
    df.loc[(df['int_LAN'] >= 54002) & (df['int_LAN'] <= 54013) & (df['year'] == 1970), 'res_reg'] = 1
    df.loc[(df['statefip'] == 11) & (df['year'] == 1970), 'res_reg'] = 3
    df.loc[(df['year'] == 1955), 'res_reg'] = df['_region']
    df = df.drop(columns=['_region'])
    df = df.rename(columns={'res_reg': '_region'})  # see codebook NFS65

    # Live births
    df.loc[df['ch_totlb'] > 12, 'ch_totlb'] = 12

    # coital frequency
    df['_coit_freq'] = np.NaN
    df.loc[df['preg0_coitfr'] < 98, '_coit_freq'] = df['preg0_coitfr']

    # 5. Merge to statenames since some states have the name with a space ' ' in the beginning
    sn = pd.read_stata('data/temp.dta', convert_categoricals=False)
    lis = sn['statefip']
    for i in lis:
        L = len(df['statename'][df['statefip'] == i])
        df['statename'][df['statefip'] == i] = np.repeat(sn['statename'][sn['statefip'] == i], L)
        # df.loc[df['statefip'] == i, 'statename'] = np.repeat(sn['statename'][sn['statefip'] == i], L)

    # 6. Merge to Griswold Laws
    gl = pd.read_stata('data/griswoldlaws5.dta', convert_categoricals=False)
    df = df[df.statefip != 11]  # delete Columbia
    df['sales'] = np.NaN
    df['any'] = np.NaN
    df['_Phys'] = np.NaN
    df['cstck'] = np.NaN
    df['_Phy_LB'] = np.NaN
    lis = gl['statefip']
    indexcol = ['sales', 'any', '_Phys', 'cstck', '_Phy_LB']
    for column in indexcol:
        # print(column)
        for i in lis:
            L = len(df[column][df['statefip'] == i])
            df[column][df['statefip'] == i] = np.repeat(gl[column][gl['statefip'] == i], L)

    df['__yobcat'] = np.NaN
    if agegroup_type_Bailey is True:
        lis = range(1910, 1950, 20)
    else:
        lis = range(1910, 1960, 20)
    for i in lis:
        # print(i)
        df.loc[(df['_yobcat'] >= i) & (df['_yobcat'] < i + 20), '__yobcat'] = i
        # maybe error in stata code (values above 1950 are set as missing)

    return df
import pandas as pd
import numpy as np
import pickle as pk
import math
import os

anes_data_inp = pd.read_stata("data/ANES_raw2008-1948.dta", convert_categoricals=False)

# Map state codes to federal judicial circuits
circuit_mapping = {
    23: 1, 25: 1, 33: 1, 44: 1,
    9: 2, 36: 2, 50: 2,
    10: 3, 34: 3, 42: 3,
    24: 4, 37: 4, 45: 4, 51: 4, 54: 4,
    22: 5, 28: 5, 48: 5,
    21: 6, 26: 6, 39: 6, 47: 6,
    17: 7, 18: 7, 55: 7,
    5: 8, 19: 8, 27: 8, 29: 8, 31: 8, 38: 8, 46: 8,
    2: 9, 4: 9, 6: 9, 15: 9, 16: 9, 30: 9, 32: 9, 41: 9, 53: 9,
    8: 10, 20: 10, 35: 10, 40: 10, 49: 10, 56: 10,
    1: 11, 12: 11, 13: 11,
    11: 12
}

# Below is the sorted values of the state ID's in the ANES
# unique = anes_data_inp['VCF0901A'].unique()
# print(sorted([int(x) for x in unique if not math.isnan(x)]))

# Print the iloc of the columns of the last 4 instruments
# print(anes_data_inp.columns.get_loc("VCF9003"))
# print(anes_data_inp.columns.get_loc("VCF9004"))
# print(anes_data_inp.columns.get_loc("VCF9005"))
# print(anes_data_inp.columns.get_loc("VCF9006"))
# Import pandas and matplotlib
import pandas as pd
import matplotlib.pyplot as plt

stata_file = './OtherFiles/disarea.dta'

# Load Stata file into a pandas DataFrame: df
df = pd.read_stata(stata_file)

# Print the head of the DataFrame df
print(df.head())

# Plot histogram of one column of the DataFrame
df[['disa10']].hist()
plt.xlabel('Extent of disease')
plt.ylabel('Number of countries')
plt.show()
import pandas as pd

# Reading in the data from Stata format into pandas data frames
hhCompLvl1 = pd.read_stata('./Datasets/HouseholdComposition_level_1.dta')
hhCompLvl2 = pd.read_stata('./Datasets/HouseholdComposition_members_level_2.dta')
fDiaryLvl1 = pd.read_stata('./Datasets/33.FoodDiary_level_1.dta').sort_values(by=['hh_ID']).reset_index(drop=True)
fDiaryLvl2 = pd.read_stata('./Datasets/34.FoodDiary_food_level_2.dta').sort_values(by=['hh_ID', 'food_1_ID', 'food_2_ID'])
fDiaryLvl3 = pd.read_stata('./Datasets/35.Fooddiary_food_foodtype_level_3.dta')

# Removing information irrelevant to this project
fDiaryLvl1.drop(['submissiondate', 'start_time', 'end_time', 'today', 'start_date',
                 'expiration_date', 'max_submissions', 'task_value', 'task_length'],
                axis=1, inplace=True)
fDiaryLvl2.drop(['food_label'], axis=1, inplace=True)
fDiaryLvl3.drop(['food_type_label', 'food_unit_label'], axis=1, inplace=True)

# Linking the food diary levels using the unique ID numbers
fDiary1Link2 = fDiaryLvl1.merge(fDiaryLvl2)
fDiary2Link3 = fDiary1Link2.merge(fDiaryLvl3).sort_values(by=['hh_ID', 'week_number']).reset_index(drop=True)
# A range of columns could be dropped here with fDiary2Link3.drop(columns=[...])

householdIDLinked = pd.Series(fDiaryLvl1.hh_ID.unique()).sort_values()
weekNumbers = pd.Series(fDiaryLvl1.week_number.unique()).sort_values()
unitsUsed = pd.Series(fDiaryLvl3.food_type_unit.unique())
# The list of different foods which will be used to calculate calories
foodsList = pd.DataFrame(fDiary2Link3.food_type_name.unique(), columns=['food_type'])
foodGroups = pd.Series(fDiary2Link3.food_grp_namec.unique())
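# The merges above let pandas infer the join keys from shared column names.
# Being explicit about the keys, and validating cardinality, catches bad
# linkages early. A self-contained sketch with hypothetical frames, not the
# survey data:
import pandas as pd

lvl1 = pd.DataFrame({'hh_ID': [1, 2], 'region': ['A', 'B']})
lvl2 = pd.DataFrame({'hh_ID': [1, 1, 2], 'food_1_ID': [10, 11, 12]})
# validate='one_to_many' raises if lvl1 unexpectedly has duplicate keys
linked = lvl1.merge(lvl2, on='hh_ID', validate='one_to_many')
print(linked)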
ucid2nsrc[ucid] = [name, stid, rid, cid]

files = []
with open(merged_data_files, 'r') as f:
    for line in f:
        file_path = line.strip()
        if not os.path.exists(file_path):
            print('File %s does not exist!' % file_path)
            exit()
        files.append(file_path)

for merged_data_file in files:
    if merged_data_file.endswith('dta'):
        fname_wo_ext = merged_data_file.split(".dta")[0]
        print("Loading dta file " + merged_data_file)
        data = pd.read_stata(merged_data_file)
        data_reader = pd.read_stata(merged_data_file, iterator=True)
        # Deal with encoding problems in data and value labels of Stata files:
        # round-tripping through bytes strips any non-ASCII characters
        for col in data.select_dtypes(include=object).columns:
            data[col] = data[col].str.encode('utf-8').str.decode('ascii', 'ignore')
            # data[col] = data[col].str.replace("\u2019", "'")
            # data[col] = data[col].str.replace("\uff0c", ",")
            # data[col] = data[col].str.replace("\uff01", "!")
        value_labels = {}
        raw_labels = data_reader.value_labels()  # read the label table once
        for var in raw_labels.keys():
            var_recode = var.encode('utf-8').decode('latin-1', 'ignore')
            value_labels[var_recode] = {}
            for val in raw_labels[var].keys():
from data_functions_albert import remove_outliers

os.chdir('C:/Users/rodri/OneDrive/Documentos/IDEA/Master tesi/Data & Code (Uganda)/data10')
pd.options.display.float_format = '{:,.2f}'.format
dollars = 2586.89

# AGRICULTURAL SEASON 1:
# Omit rents when evaluating agricultural productivity.

# =============================================================================
# Fertilizers & labor inputs
# =============================================================================
ag3a = pd.read_stata('agsec3a.dta')
ag3a = ag3a[["HHID", 'pltid', "a3aq8", "a3aq19", 'a3aq31', 'a3aq38', 'a3aq39',
             'a3aq42a', 'a3aq42b', 'a3aq42c']]
ag3a['hhlabor'] = ag3a["a3aq39"].fillna(0)
# Sum over hours of men, women, and kids; we assume all of them are equally productive.
ag3a['hired_labor'] = ag3a["a3aq42a"].fillna(0) + ag3a["a3aq42b"].fillna(0) + ag3a["a3aq42c"].fillna(0)
ag3a = ag3a[["HHID", 'pltid', "a3aq8", "a3aq19", "a3aq31", 'hhlabor', 'hired_labor']]
ag3a.columns = [
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
import pandas as pd
import theano
import seaborn as sns
import os

os.chdir("/Users/alexwolf/Dropbox/Uni/WB RS Estimates/4. Estimation/rsest/")

# --- Read Data ---
path = "../../3. Data/Combined_Sample.dta"
Data = pd.read_stata(path)
Data[['wavecode', 'w_clothm', 'ltotR']] = Data[['wavecode', 'w_clothm', 'ltotR']].round(4)
Data[['wavecode', 'w_clothm', 'ltotR']].head()

wave_idx = Data['wavecode']
clothm_mean = np.squeeze(np.mean(Data[['w_clothm']]))

with pm.Model() as hierarchical_model:
    # Hyperpriors
    mu_a = pm.Normal('mu_alpha', mu=clothm_mean, sd=10)
    sigma_a = pm.HalfCauchy('sigma_alpha', beta=2)
    mu_b = pm.Normal('mu_beta', mu=0., sd=10)
    sigma_b = pm.HalfCauchy('sigma_beta', beta=2)
    # Intercept for each county, distributed around group mean mu_a
import random

import scipy
import numpy as np
import pandas as pd

try:
    import SparseSC as SC
except ImportError:
    raise RuntimeError("SparseSC is not installed. Use 'pip install -e .' or 'conda develop .' from repo root to install in dev mode")

random.seed(12345)
np.random.seed(101101001)

pkl_file = "../replication/smoking_fits.pkl"

smoking_df = pd.read_stata("../replication/smoking.dta")
smoking_df['year'] = smoking_df['year'].astype('int')
smoking_df = smoking_df.set_index(['state', 'year'])
Y = smoking_df[['cigsale']].unstack('year')

T0 = 19
i_t = 2  # unit 3, but zero-indexed
treated_units = [i_t]
control_units = [u for u in range(Y.shape[0]) if u not in treated_units]
Y_names = Y.columns.get_level_values('year')
Y_pre_names = ["cigsale(" + str(i) + ")" for i in Y_names[:T0]]
print(Y.isnull().sum().sum())  # 0
Y = Y.values
T = Y.shape[1]
T1 = T - T0
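# A small sketch, not in the original file, of the pre/post split implied by
# the T0/T1 bookkeeping above; rows of Y are units, columns are years:
Y_pre = Y[:, :T0]   # the 19 pre-treatment years
Y_post = Y[:, T0:]  # the treatment period onward
assert Y_pre.shape[1] == T0 and Y_post.shape[1] == T1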
# Import the SAS reader (pandas and pyplot already imported)
from sas7bdat import SAS7BDAT

with SAS7BDAT('sales.sas7bdat') as file:
    df_sas = file.to_data_frame()

# Print head of DataFrame
print(df_sas.head())

# Plot histogram of DataFrame features
df_sas[['P']].hist()
plt.ylabel('count')
plt.show()

# Load Stata file into a pandas DataFrame: df
df = pd.read_stata('disarea.dta')

# Print the head of the DataFrame df
print(df.head())

# Plot histogram of one column of the DataFrame
df[['disa10']].hist()
plt.xlabel('Extent of disease')
plt.ylabel('Number of countries')
plt.show()

# Import packages
import numpy as np
import h5py

# Assign filename: file
import pandas as pd
from idcempy import gimnl
import os
import unittest
import numpy as np

"""
Replication of results from Bagozzi, Benjamin E. and Kathleen Marchetti. 2017.
"Distinguishing Occasional Abstention from Routine Indifference in Models of
Vote Choice." Political Science Research and Methods 5(2): 277-294.
"""

DAT = pd.read_stata(
    os.getcwd() + "/data/replicationdata.dta", convert_categoricals=False)

x = ['educ', 'female', 'black', 'hispanic', 'party7', 'w3gbvalu2',
     'presbatt', 'south', 'gayban2', 'agegroup2', 'bornagn_w',
     'brnag_wXgmb2', 'catholic', 'cathXgmb2', 'other_rel', 'secular',
     'secXgmb2', 'ideo', 'w3mobidx']
z = ['educ', 'agegroup2', 'w3mobidx', 'secular']
y = ['vote_turn']

order_Kerry = [0, 1, 2]
order_Bush = [0, 2, 1]

binflatecat = "baseline"
sinflatecat = "second"
tinflatecat = "third"
from pyreghdfe import Pyreghdfe
from sklearn.datasets import load_boston
from utils import sklearn_to_df
import pandas as pd
import numpy as np
from utils import get_np_columns
import statsmodels.formula.api as smf
import econtools
import econtools.metrics as mt
import statsmodels.api as sm

df = pd.read_stata('data/cleaned_nlswork.dta')
df['hours_log'] = np.log(df['hours'])

# sanity check: cluster standard errors without absorption
# results = smf.ols(formula='wks_work~1', data=df).fit()
# y = np.asarray([[-100, 100, -100, 1234]]).T
# X = np.asarray([[1, 2, 3, 4], [1, 1, 1, 1]]).T
# results = sm.OLS(y, X).fit()
# results = smf.ols(formula='ttl_exp~wks_ue', data=df).fit()
# results = results.get_robustcov_results(cov_type='cluster',
#                                         groups=get_np_columns(df, ['idcode']))
# print(np.sum(results.resid))
# import pdb; pdb.set_trace()

pyreghdfe = Pyreghdfe(df=df,
                      target='hours_log',
                      predictors=['union'],
                      ids=['year'],
                      cluster_ids=['idcode'])
results = pyreghdfe.fit(cov_type='cluster', groups=['idcode'])
# Import required packages
# ------------------------------------------------------------------------------
import pandas as pd  # used to read the .dta file via .read_stata()
import numpy as np
import matplotlib.pyplot as plt  # used to inspect the relationship between the variables of interest in a scatterplot
# %matplotlib inline
import math  # used in some equations
from scipy.optimize import minimize  # for optimization of the likelihood function (MLE method)
import scipy.optimize as opt
import statsmodels.api as statmod
import scipy.stats as stats

# ------------------------------------------------------------------------------
# Read data and prepare to utilize (Part 1 & 2)
# ------------------------------------------------------------------------------
df1 = pd.read_stata('PS3_data.dta')
# My note for later: https://www.shanelynn.ie/using-pandas-dataframe-creating-editing-viewing-data-in-python/
print("Dataframe: ")
print(df1.head())

'''
hlabinc = annual labor income of the head
hannhrs = annual hours of the head
hsex   = gender of the head (1 = Male, 2 = Female)
hrace  = race of the head (1 = White, 2 = Black, 3 = Native American,
         4 = Asian/Pacific Islander, 5 = Hispanic, 6, 7 = Other)
age    = age of the head
hyrsed = years of education of the head
'''

print("Data Statistics:")
print(df1.describe())

print("Scatterplot between annual labor income of the head and years of education of the head")
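# A minimal sketch of the scatterplot the print statement above announces,
# using the hlabinc and hyrsed columns described in the docstring; the
# styling choices are ours, not the original script's:
plt.scatter(df1['hyrsed'], df1['hlabinc'], alpha=0.3)
plt.xlabel('Years of education of the head')
plt.ylabel('Annual labor income of the head')
plt.show()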
"""This function cleans the data from the chs_data.dta. It prepares the dataset we use for further analysis. """ import numpy as np import pandas as pd from bld.project_paths import project_paths_join as ppj def save_data(sample): """ Save clean data as .csv file. """ sample.to_csv(ppj("OUT_DATA", "data_clean.csv"), sep=",") if __name__ == "__main__": data = pd.read_stata(ppj("IN_DATA", "chs_data.dta")) data.replace(-100, np.nan, inplace=True) data = data.query("age == 0") data.reset_index(inplace=True) data = data["weightbirth"] data.fillna(data.mean(), inplace=True) save_data(data)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_stata('C:/Users/林子晴/Desktop/dataset.dta')
# keep sales with a positive assessed value (taxeringsvärde) and price (pris)
df = df[df.taxeringsvrde > 0]
df = df[df.pris > 0]
df = df[df.taxr == 2006]
df = df[df.fakod == 1]
df = df.sort_values("kontraktsdatum", ascending=True)  # sort by contract date

month = df.groupby(df['month'])
year = df.groupby(df['year_nr'])


def get_year_price(i):
    return year.get_group(i)['pris']


def get_year_day(i):
    return year.get_group(i)['kontraktsdatum']


def get_day(k):
    return month.get_group(k)['day_nr']


def get_date(k):
    return month.get_group(k)['kontraktsdatum']
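# A minimal usage sketch, not part of the original script: plot prices over
# contract dates for one year group. The group key is taken from the data
# itself, since the actual year_nr coding is not shown here.
yr = df['year_nr'].iloc[0]
plt.plot(get_year_day(yr), get_year_price(yr))
plt.xlabel('kontraktsdatum (contract date)')
plt.ylabel('pris (price)')
plt.show()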
# This helper assumes its module provides the os.path utilities, pandas'
# read_stata, and a dataset-utils module bound to `du` (not shown here).
from os.path import abspath, dirname, join

from pandas import read_stata


def _get_data():
    # locate credit_cards.dta next to this module and load it
    filepath = dirname(abspath(__file__))
    filename = join(filepath, 'credit_cards.dta')
    data = read_stata(filename)
    data = du.strip_column_names(data)
    return data
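# Minimal usage sketch (assumes credit_cards.dta actually sits next to this
# module):
if __name__ == '__main__':
    print(_get_data().head())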
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 20 10:31:13 2018

@author: noviayou
"""
import numpy as np
import pandas as pd

d1 = pd.read_stata('cds.dta')
d2 = pd.read_csv('WRDS.csv')
d2.columns = map(str.lower, d2.columns)
d1.gvkey = d1.gvkey.astype('int')

# build a common year/month key on both panels
d2['mdate'] = d2['datadate'].apply(
    lambda x: pd.to_datetime(str(x), format='%Y%m%d'))
d1['year'] = pd.DatetimeIndex(d1['mdate']).year
d1['month'] = pd.DatetimeIndex(d1['mdate']).month
d2['year'] = pd.DatetimeIndex(d2['mdate']).year
d2['month'] = pd.DatetimeIndex(d2['mdate']).month
d2.month.unique()

# d2['quarter'] = ...  (mapping months to quarters was left unfinished)
# def type(i):
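# A sketch, not in the original script, of the panel merge that the gvkey /
# year / month keys above set up, assuming the lower-cased WRDS file also
# carries a gvkey column; suffixes keep overlapping columns apart:
merged = d1.merge(d2, on=['gvkey', 'year', 'month'], how='inner',
                  suffixes=('_cds', '_wrds'))
print(merged.shape)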
                             inplace=True)
# temporarily fill missing qid_cat with 0 so the column can be cast to int,
# then restore the 0s to NaN
mergecat_sele_qid_location['qid_cat'] = mergecat_sele_qid_location['qid_cat'].fillna(0)
mergecat_sele_qid_location[['qid_cat']] = mergecat_sele_qid_location[['qid_cat']].astype(int)
mergecat_sele_qid_location = mergecat_sele_qid_location.replace(0, np.nan)

# ################## step 2: merge with country and get country information #######################
qid_MtCountry = qid_MtCountry.loc[:, ['qid_cat', 'qid_country', 'subject']]
qid_MtCountry.drop_duplicates(inplace=True)
mergecat_sele_qid_location_WP = pd.merge(mergecat_sele_qid_location,
                                         qid_MtCountry,
                                         on='qid_cat',
                                         how='left')

A1_05_03_mapCountry = pd.read_stata(material_path + 'material/A1_05_03_mapCountry.dta')
A1_05_03_mapCountry.rename(columns={'location': 'L_or_P'}, inplace=True)
# get country information from "location"
mergecat_sele_qid_location_WP = pd.merge(mergecat_sele_qid_location_WP,
                                         A1_05_03_mapCountry,
                                         on='L_or_P',
                                         how='left')
mergecat_sele_qid_location_WP['locationLabel'] = mergecat_sele_qid_location_WP['locationLabel'].str.replace(" ", "_")

A1_05_03_mapCountry.rename(columns={'L_or_P': 'locationLabel'}, inplace=True)
A1_05_03_mapCountry.rename(columns={'country': 'country_locationLabel_bselect'}, inplace=True)
mergecat_sele_qid_location_WP = pd.merge(mergecat_sele_qid_location_WP,
                                         A1_05_03_mapCountry,
                                         on='locationLabel',