def main(): #### DEFINE VARIABLES floor_recode_df, meta = pyreadstat.read_dta('./floor_recode.dta') toilet_recode_df, meta = pyreadstat.read_dta('./toilet_recode.dta') water_recode_df, meta = pyreadstat.read_dta('./water_recode.dta') floor_recode_dict = floor_recode_df.set_index( 'floor_code').to_dict()['floor_qual'] toilet_recode_dict = toilet_recode_df.set_index( 'toilet_code').to_dict()['toilet_qual'] water_recode_dict = water_recode_df.set_index( 'water_code').to_dict()['water_qual'] water_recode_dict[63] = 4 country_codes = requests.get( 'http://api.dhsprogram.com/rest/dhs/countries?returnFields=CountryName,DHS_CountryCode' ).text country_code_dict = {} for country in json.loads(country_codes)['Data']: country_code_dict[ country['DHS_CountryCode']] = country['CountryName'].replace( ' ', '_') features = [ 'WATER_SOURCE', 'TOILET_TYPE', 'ELECTRICITY', 'RADIO', 'TV', 'FRIDGE', 'MOTORCYCLE', 'CAR', 'FLOOR', 'ROOMSPP', 'PHONE', 'CELLPHONE' ] # ,'cellphone'] info = ['HHID', 'YEAR'] dhs_survey_path = './dhs_household_data/' wealth_path = './dhs_wealth/' shape_path = './geo_data/' dhs_surveys = [ file for file in os.listdir('./dhs_household_data/') if file.endswith('.SAS7BDAT') ] geo_wealth_path = './gps_csv/' sustainlab_group_file = 'dhs_clusters_sustainlab_group.csv' # initiate Class Object dhs_obj = DHS_preparation(floor_recode=floor_recode_dict, toilet_recode=toilet_recode_dict, water_recode=water_recode_dict, country_code_dict=country_code_dict, features=features, info=info, dhs_survey_path=dhs_survey_path, wealth_path=wealth_path, shape_path=shape_path, geo_wealth_path=geo_wealth_path, sustainlab_group_file=sustainlab_group_file) # perform dhs preparation # calculate wealth index and create csv files per country_year for filename in os.listdir(dhs_obj.dhs_survey_path): dhs_obj.recode_and_format_dhs(filename) # add geo data and create new files for shapefile in os.listdir(shape_path): dhs_obj.create_wealth_geo_df(shapefile) # split sustainlab clusters to files per country_year dhs_obj.split_sustainlab_clusters()
def sysuse_file(dir: str): try: self.data, self.meta = pyreadstat.read_dta(dir) print('(' + self.meta.file_label + ')') self.globals['dir'] = dir except pyreadstat._readstat_parser.ReadstatError: print_red('file \"%s\" not found' % dir)
def get_data(df_url, filepath, variables): survey_obj = get_survey_info(df_url, filepath) # read stata or sas file if filepath.lower().endswith('.dta'): df, meta = pyreadstat.read_dta(filepath, apply_value_formats=True, usecols=variables) elif filepath.lower().endswith('.sas7bdat'): df, meta = pyreadstat.read_sas7bdat(filepath, usecols=variables) df['survey'] = survey_obj['survey'] df.columns = df.columns.str.upper() # change the variable names for household number, cluster number and line number # so it matches with the household memeber module # HV001, HV002 and HVIDX are the variables we join different modules on if survey_obj['module'] == 'Individual Recode': df.rename(columns={ 'V001': 'HV001', 'V002': 'HV002', 'V003': 'HVIDX' }, inplace=True) elif survey_obj['module'] == "Men's Recode": df.rename(columns={ 'MV001': 'HV001', 'MV002': 'HV002', 'MV003': 'HVIDX' }, inplace=True) # store the survey infomation in the metadata file as well meta_dict = meta.column_names_to_labels df_meta = (pd.DataFrame( list(meta_dict.items()), columns=['var', 'label']).assign(module=survey_obj['module']).assign( survey=survey_obj['survey']).assign(ISO=survey_obj['iso']).assign( year=survey_obj['year'])) return [df, df_meta]
def parse_file(filename): if "sav" in filename.lower(): df, meta = pyreadstat.read_sav(filename, apply_value_formats=True, metadataonly=True) return df, meta, False elif "por" in filename.lower(): df, meta = pyreadstat.read_por(filename, apply_value_formats=True, metadataonly=True) return df, meta, False elif "sas7bdat" in filename.lower(): df, meta = pyreadstat.read_sas7bdat(filename, metadataonly=True) return df, meta, False elif "xpt" in filename.lower(): df, meta = pyreadstat.read_xport(filename, metadataonly=True) return df, meta, False elif "dta" in filename.lower(): df, meta = pyreadstat.read_dta(filename, apply_value_formats=True, metadataonly=True) return df, meta, False else: return None, None, True
# apply the formats from the beginning nls97spss, metaspss = pyreadstat.read_sav('data/nls97.sav', apply_value_formats=True, formats_as_category=True) nls97spss.columns = metaspss.column_labels nls97spss.columns = nls97spss.columns.\ str.lower().\ str.replace(' ','_').\ str.replace('[^a-z0-9_]', '') nls97spss.dtypes nls97spss.head() nls97spss.govt_responsibility__provide_jobs_2006.\ value_counts(sort=False) nls97spss.set_index('pubid__yth_id_code_1997', inplace=True) # do the same for stata data nls97stata, metastata = pyreadstat.read_dta('data/nls97.dta', apply_value_formats=True, formats_as_category=True) nls97stata.columns = metastata.column_labels nls97stata.columns = nls97stata.columns.\ str.lower().\ str.replace(' ','_').\ str.replace('[^a-z0-9_]', '') nls97stata.dtypes nls97stata.head() nls97stata.govt_responsibility__provide_jobs_2006.\ value_counts(sort=False) nls97stata.min() nls97stata.replace(list(range(-9,0)), np.nan, inplace=True) nls97stata.min() nls97stata.set_index('pubid__yth_id_code_1997', inplace=True) # pull sas data, using the sas catalog file for value labels
def read(self): engine = sa.create_engine(self.db) conn = engine.connect() print('---- begin >' + str(datetime.datetime.now())) print('----- read >' + str(datetime.datetime.now())) df, meta = pyreadstat.read_dta(self.filename) df.to_sql("plss_tmp", con=engine, if_exists="replace") print('---- merge >' + str(datetime.datetime.now())) t = text(""" insert into plss ( CourseParticipationId,CourseCalendarId,ApplicationId,DateStartCourse,DateActualFinishCourse, OutcomeId,DateDeleted,OutcomeStatusId,OutcomeCertificationId,OutcomeCertificationAwardId, OutcomeEarlyFinishReasonId,OriginalOutcomeId,ApplicationOriginId,ApplicationStatusId,FETProviderId, DeliveryFETProviderId,ProgrammeId,TargetAwardId,PublishedCourseTitle,ProgrammeCategoryId, DateActualStart,DateActualFinish,DeliveryModeId,Title,Cluster,ProgrammeAwardLevel, ProgrammeCertified,TargetAward,AwardAchievable,AwardSummary,ISCEDBroadUID,ISCEDDetailedFieldID, learnerfinishdate,learnerstartyear,finishmonth,finishyear,Gender,ParentFETProviderId,Parent, ParentDivisionProviderId,FETProviderTypeId,FETProviderName,IsETB,CountyId, programmecategorydescription,hasemployers,LearnerCountyId,NationalityId,outcomestatusdescription, outcomecertificationdescription,isoutcomecertified,outcomecertificationawarddescrip, outcomedescription,hash_ppsn,age ) select te.CourseParticipationId,te.CourseCalendarId,te.ApplicationId,te.DateStartCourse,te.DateActualFinishCourse, te.OutcomeId,te.DateDeleted,te.OutcomeStatusId,te.OutcomeCertificationId,te.OutcomeCertificationAwardId, te.OutcomeEarlyFinishReasonId,te.OriginalOutcomeId,te.ApplicationOriginId,te.ApplicationStatusId,te.FETProviderId, te.DeliveryFETProviderId,te.ProgrammeId,te.TargetAwardId,te.PublishedCourseTitle,te.ProgrammeCategoryId, te.DateActualStart,te.DateActualFinish,te.DeliveryModeId,te.Title,te.Cluster,te.ProgrammeAwardLevel, te.ProgrammeCertified,te.TargetAward,te.AwardAchievable,te.AwardSummary,te.ISCEDBroadUID,te.ISCEDDetailedFieldID, te.learnerfinishdate,te.learnerstartyear,te.finishmonth,te.finishyear,te.Gender,te.ParentFETProviderId,te.Parent, te.ParentDivisionProviderId,te.FETProviderTypeId,te.FETProviderName,te.IsETB,te.CountyId, te.programmecategorydescription,te.hasemployers,te.LearnerCountyId,te.NationalityId,te.outcomestatusdescription, te.outcomecertificationdescription,te.isoutcomecertified,te.outcomecertificationawarddescrip, te.outcomedescription,te.hash_ppsn,te.age from plss_tmp te left join plss pl on te.hash_ppsn = pl.hash_ppsn and te.CourseParticipationId = pl.CourseParticipationId and te.CourseCalendarId = pl.CourseCalendarId and te.ApplicationId = pl.ApplicationId and te.DateStartCourse = pl.DateStartCourse and (te.DateActualFinishCourse = pl.DateActualFinishCourse or (te.DateActualFinishCourse is null and pl.DateActualFinishCourse is null)) and (te.OutcomeId = pl.OutcomeId or (te.OutcomeId is null and pl.OutcomeId is null)) and (te.DateDeleted = pl.DateDeleted or (te.DateDeleted is null and pl.DateDeleted is null)) and (te.OutcomeStatusId = pl.OutcomeStatusId or (te.OutcomeStatusId is null and pl.OutcomeStatusId is null)) and (te.OutcomeCertificationId = pl.OutcomeCertificationId or (te.OutcomeCertificationId is null and pl.OutcomeCertificationId is null)) and (te.OutcomeCertificationAwardId = pl.OutcomeCertificationAwardId or (te.OutcomeCertificationAwardId is null and pl.OutcomeCertificationAwardId is null)) and (te.OutcomeEarlyFinishReasonId = pl.OutcomeEarlyFinishReasonId or (te.OutcomeEarlyFinishReasonId is null and pl.OutcomeEarlyFinishReasonId is null)) and (te.OriginalOutcomeId = pl.OriginalOutcomeId or (te.OriginalOutcomeId is null and pl.OriginalOutcomeId is null)) and (te.ApplicationOriginId = pl.ApplicationOriginId or (te.ApplicationOriginId is null and pl.ApplicationOriginId is null)) and (te.ApplicationStatusId = pl.ApplicationStatusId or (te.ApplicationStatusId is null and pl.ApplicationStatusId is null)) and (te.FETProviderId = pl.FETProviderId or (te.FETProviderId is null and pl.FETProviderId is null)) and (te.DeliveryFETProviderId = pl.DeliveryFETProviderId or (te.DeliveryFETProviderId is null and pl.DeliveryFETProviderId is null)) and (te.ProgrammeId = pl.ProgrammeId or (te.ProgrammeId is null and pl.ProgrammeId is null)) and (te.TargetAwardId = pl.TargetAwardId or (te.TargetAwardId is null and pl.TargetAwardId is null)) and (te.PublishedCourseTitle = pl.PublishedCourseTitle or (te.PublishedCourseTitle is null and pl.PublishedCourseTitle is null)) and (te.ProgrammeCategoryId = pl.ProgrammeCategoryId or (te.ProgrammeCategoryId is null and pl.ProgrammeCategoryId is null)) and (te.DateActualStart = pl.DateActualStart or (te.DateActualStart is null and pl.DateActualStart is null)) and (te.DateActualFinish = pl.DateActualFinish or (te.DateActualFinish is null and pl.DateActualFinish is null)) and (te.DeliveryModeId = pl.DeliveryModeId or (te.DeliveryModeId is null and pl.DeliveryModeId is null)) and te.Title = pl.Title and (te.Cluster = pl.Cluster or (te.Cluster is null and pl.Cluster is null)) and (te.ProgrammeAwardLevel = pl.ProgrammeAwardLevel or (te.ProgrammeAwardLevel is null and pl.ProgrammeAwardLevel is null)) and (te.ProgrammeCertified = pl.ProgrammeCertified or (te.ProgrammeCertified is null and pl.ProgrammeCertified is null)) and (te.TargetAward = pl.TargetAward or (te.TargetAward is null and pl.TargetAward is null)) and (te.AwardAchievable = pl.AwardAchievable or (te.AwardAchievable is null and pl.AwardAchievable is null)) and (te.AwardSummary = pl.AwardSummary or (te.AwardSummary is null and pl.AwardSummary is null)) and (te.ISCEDBroadUID = pl.ISCEDBroadUID or (te.ISCEDBroadUID is null and pl.ISCEDBroadUID is null)) and (te.ISCEDDetailedFieldID = pl.ISCEDDetailedFieldID or (te.ISCEDDetailedFieldID is null and pl.ISCEDDetailedFieldID is null)) and (te.learnerfinishdate = pl.learnerfinishdate or (te.learnerfinishdate is null and pl.learnerfinishdate is null)) and (te.learnerstartyear = pl.learnerstartyear or (te.learnerstartyear is null and pl.learnerstartyear is null)) and (te.finishmonth = pl.finishmonth or (te.finishmonth is null and pl.finishmonth is null)) and (te.finishyear = pl.finishyear or (te.finishyear is null and pl.finishyear is null)) and (te.Gender = pl.Gender or (te.Gender is null and pl.Gender is null)) and (te.ParentFETProviderId = pl.ParentFETProviderId or (te.ParentFETProviderId is null and pl.ParentFETProviderId is null)) and (te.Parent = pl.Parent or (te.Parent is null and pl.Parent is null)) and (te.ParentDivisionProviderId = pl.ParentDivisionProviderId or (te.ParentDivisionProviderId is null and pl.ParentDivisionProviderId is null)) and (te.FETProviderTypeId = pl.FETProviderTypeId or (te.FETProviderTypeId is null and pl.FETProviderTypeId is null)) and (te.FETProviderName = pl.FETProviderName or (te.FETProviderName is null and pl.FETProviderName is null)) and (te.IsETB = pl.IsETB or (te.IsETB is null and pl.IsETB is null)) and (te.CountyId = pl.CountyId or (te.CountyId is null and pl.CountyId is null)) and (te.programmecategorydescription = pl.programmecategorydescription or (te.programmecategorydescription is null and pl.programmecategorydescription is null)) and (te.hasemployers = pl.hasemployers or (te.hasemployers is null and pl.hasemployers is null)) and (te.LearnerCountyId = pl.LearnerCountyId or (te.LearnerCountyId is null and pl.LearnerCountyId is null)) and (te.NationalityId = pl.NationalityId or (te.NationalityId is null and pl.NationalityId is null)) and (te.outcomestatusdescription = pl.outcomestatusdescription or (te.outcomestatusdescription is null and pl.outcomestatusdescription is null)) and (te.outcomecertificationdescription = pl.outcomecertificationdescription or (te.outcomecertificationdescription is null and pl.outcomecertificationdescription is null)) and (te.isoutcomecertified = pl.isoutcomecertified or (te.isoutcomecertified is null and pl.isoutcomecertified is null)) and (te.outcomecertificationawarddescrip = pl.outcomecertificationawarddescrip or (te.outcomecertificationawarddescrip is null and pl.outcomecertificationawarddescrip is null)) and (te.outcomedescription = pl.outcomedescription or (te.outcomedescription is null and pl.outcomedescription is null)) and (te.age = pl.age or (te.age is null and pl.age is null)) where pl.hash_ppsn is null """) conn.execute(t) print('------ add >' + str(datetime.datetime.now())) t = text(""" delete from plss where id in (select pl.id from plss pl left join plss_tmp te on te.hash_ppsn = pl.hash_ppsn and (te.CourseParticipationId = pl.CourseParticipationId or (te.CourseParticipationId is null and pl.CourseParticipationId is null)) and (te.CourseCalendarId = pl.CourseCalendarId or (te.CourseCalendarId is null and pl.CourseCalendarId is null)) and (te.ApplicationId = pl.ApplicationId or (te.ApplicationId is null and pl.ApplicationId is null)) and (te.DateStartCourse = pl.DateStartCourse or (te.DateStartCourse is null and pl.DateStartCourse is null)) and (te.DateActualFinishCourse = pl.DateActualFinishCourse or (te.DateActualFinishCourse is null and pl.DateActualFinishCourse is null)) and (te.OutcomeId = pl.OutcomeId or (te.OutcomeId is null and pl.OutcomeId is null)) and (te.DateDeleted = pl.DateDeleted or (te.DateDeleted is null and pl.DateDeleted is null)) and (te.OutcomeStatusId = pl.OutcomeStatusId or (te.OutcomeStatusId is null and pl.OutcomeStatusId is null)) and (te.OutcomeCertificationId = pl.OutcomeCertificationId or (te.OutcomeCertificationId is null and pl.OutcomeCertificationId is null)) and (te.OutcomeCertificationAwardId = pl.OutcomeCertificationAwardId or (te.OutcomeCertificationAwardId is null and pl.OutcomeCertificationAwardId is null)) and (te.OutcomeEarlyFinishReasonId = pl.OutcomeEarlyFinishReasonId or (te.OutcomeEarlyFinishReasonId is null and pl.OutcomeEarlyFinishReasonId is null)) and (te.OriginalOutcomeId = pl.OriginalOutcomeId or (te.OriginalOutcomeId is null and pl.OriginalOutcomeId is null)) and (te.ApplicationOriginId = pl.ApplicationOriginId or (te.ApplicationOriginId is null and pl.ApplicationOriginId is null)) and (te.ApplicationStatusId = pl.ApplicationStatusId or (te.ApplicationStatusId is null and pl.ApplicationStatusId is null)) and (te.FETProviderId = pl.FETProviderId or (te.FETProviderId is null and pl.FETProviderId is null)) and (te.DeliveryFETProviderId = pl.DeliveryFETProviderId or (te.DeliveryFETProviderId is null and pl.DeliveryFETProviderId is null)) and (te.ProgrammeId = pl.ProgrammeId or (te.ProgrammeId is null and pl.ProgrammeId is null)) and (te.TargetAwardId = pl.TargetAwardId or (te.TargetAwardId is null and pl.TargetAwardId is null)) and (te.PublishedCourseTitle = pl.PublishedCourseTitle or (te.PublishedCourseTitle is null and pl.PublishedCourseTitle is null)) and (te.ProgrammeCategoryId = pl.ProgrammeCategoryId or (te.ProgrammeCategoryId is null and pl.ProgrammeCategoryId is null)) and (te.DateActualStart = pl.DateActualStart or (te.DateActualStart is null and pl.DateActualStart is null)) and (te.DateActualFinish = pl.DateActualFinish or (te.DateActualFinish is null and pl.DateActualFinish is null)) and (te.DeliveryModeId = pl.DeliveryModeId or (te.DeliveryModeId is null and pl.DeliveryModeId is null)) and (te.Title = pl.Title or (te.Title is null and pl.Title is null)) and (te.Cluster = pl.Cluster or (te.Cluster is null and pl.Cluster is null)) and (te.ProgrammeAwardLevel = pl.ProgrammeAwardLevel or (te.ProgrammeAwardLevel is null and pl.ProgrammeAwardLevel is null)) and (te.ProgrammeCertified = pl.ProgrammeCertified or (te.ProgrammeCertified is null and pl.ProgrammeCertified is null)) and (te.TargetAward = pl.TargetAward or (te.TargetAward is null and pl.TargetAward is null)) and (te.AwardAchievable = pl.AwardAchievable or (te.AwardAchievable is null and pl.AwardAchievable is null)) and (te.AwardSummary = pl.AwardSummary or (te.AwardSummary is null and pl.AwardSummary is null)) and (te.ISCEDBroadUID = pl.ISCEDBroadUID or (te.ISCEDBroadUID is null and pl.ISCEDBroadUID is null)) and (te.ISCEDDetailedFieldID = pl.ISCEDDetailedFieldID or (te.ISCEDDetailedFieldID is null and pl.ISCEDDetailedFieldID is null)) and (te.learnerfinishdate = pl.learnerfinishdate or (te.learnerfinishdate is null and pl.learnerfinishdate is null)) and (te.learnerstartyear = pl.learnerstartyear or (te.learnerstartyear is null and pl.learnerstartyear is null)) and (te.finishmonth = pl.finishmonth or (te.finishmonth is null and pl.finishmonth is null)) and (te.finishyear = pl.finishyear or (te.finishyear is null and pl.finishyear is null)) and (te.Gender = pl.Gender or (te.Gender is null and pl.Gender is null)) and (te.ParentFETProviderId = pl.ParentFETProviderId or (te.ParentFETProviderId is null and pl.ParentFETProviderId is null)) and (te.Parent = pl.Parent or (te.Parent is null and pl.Parent is null)) and (te.ParentDivisionProviderId = pl.ParentDivisionProviderId or (te.ParentDivisionProviderId is null and pl.ParentDivisionProviderId is null)) and (te.FETProviderTypeId = pl.FETProviderTypeId or (te.FETProviderTypeId is null and pl.FETProviderTypeId is null)) and (te.FETProviderName = pl.FETProviderName or (te.FETProviderName is null and pl.FETProviderName is null)) and (te.IsETB = pl.IsETB or (te.IsETB is null and pl.IsETB is null)) and (te.CountyId = pl.CountyId or (te.CountyId is null and pl.CountyId is null)) and (te.programmecategorydescription = pl.programmecategorydescription or (te.programmecategorydescription is null and pl.programmecategorydescription is null)) and (te.hasemployers = pl.hasemployers or (te.hasemployers is null and pl.hasemployers is null)) and (te.LearnerCountyId = pl.LearnerCountyId or (te.LearnerCountyId is null and pl.LearnerCountyId is null)) and (te.NationalityId = pl.NationalityId or (te.NationalityId is null and pl.NationalityId is null)) and (te.outcomestatusdescription = pl.outcomestatusdescription or (te.outcomestatusdescription is null and pl.outcomestatusdescription is null)) and (te.outcomecertificationdescription = pl.outcomecertificationdescription or (te.outcomecertificationdescription is null and pl.outcomecertificationdescription is null)) and (te.isoutcomecertified = pl.isoutcomecertified or (te.isoutcomecertified is null and pl.isoutcomecertified is null)) and (te.outcomecertificationawarddescrip = pl.outcomecertificationawarddescrip or (te.outcomecertificationawarddescrip is null and pl.outcomecertificationawarddescrip is null)) and (te.outcomedescription = pl.outcomedescription or (te.outcomedescription is null and pl.outcomedescription is null)) and (te.age = pl.age or (te.age is null and pl.age is null)) where pl.hash_ppsn is null) """) conn.execute(t) print('------ del >' + str(datetime.datetime.now())) t = text("drop table plss_tmp") conn.execute(t) print('----- drop >' + str(datetime.datetime.now()))
def load_data(filename, columns_list=None, the_first=None, the_last=None): """ It read and load different dataset formats into DataFrame. Also support optionally to load specified list of columns only. Additionaly help us to load the first or the last specified number of observation/rows of the dataset. Parameters ---------- filename: str Dataset name .. You should include the right directory where the dataset exist! columns_list: list of string, default ``None``, optional It help us to load specific list of columns optionally. the_first : int, default ``None``, optional Help to load the first n number of(head) observation/rows of the dataset. the_last : int, default ``None``, optional Help to load the last n number (tail) of observation/rows of the dataset. Returns ------- ``DataFrame`` It return the required DataFrame Examples -------- filename = "../ET_2016_DHS_11082019_632_141211/ETBR71SV/BR.csv" >>>list_columns(filename) >>>list_columns(filename,columns_list = ['V001','V384D']) >>>list_columns(filename,columns_list= ['V001','V384D'], the_first = 10) >>>list_columns(filename,the_last = 22) """ if filename.endswith('.csv') or filename.endswith('.CSV'): if columns_list: if the_first: return pd.read_csv(filename, usecols=columns_list).head(the_first) if the_last: return pd.read_csv(filename, usecols=columns_list).tail(the_last) return pd.read_csv(filename, usecols=columns_list) else: if the_first: return pd.read_csv(filename).head(the_first) if the_last: return pd.read_csv(filename).tail(the_last) return pd.read_csv(filename) elif filename.endswith('.DTA') or filename.endswith('.dta'): if columns_list: if the_first: df, meta = ps.read_dta(filename, usecols=columns_list) return df.head(the_first) if the_last: df, meta = ps.read_dta(filename, usecols=columns_list) return df.tail(the_last) df, meta = ps.read_dta(filename, usecols=columns_list) return df else: if the_first: df, meta = ps.read_dta(filename) return df.head(the_first) if the_last: df, meta = ps.read_dta(filename) return df.tail(the_last) df, meta = ps.read_dta(filename) return df elif filename.endswith('.SAV') or filename.endswith('.sav'): if columns_list: if the_first: df, meta = ps.read_sav(filename, usecols=columns_list) return df.head(the_first) if the_last: df, meta = ps.read_sav(filename, usecols=columns_list) return df.tail(the_last) df, meta = ps.read_sav(filename, usecols=columns_list) return df else: if the_first: df, meta = ps.read_sav(filename) return df.head(the_first) if the_last: df, meta = ps.read_sav(filename) return df.tail(the_last) df, meta = ps.read_sav(filename) return df
# Imports import pyreadstat import requests import json import pandas as pd import os import geopandas as gpd import numpy as np from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn import preprocessing floor_recode_df, meta = pyreadstat.read_dta('././floor_recode.dta') toilet_recode_df, meta = pyreadstat.read_dta('././toilet_recode.dta') water_recode_df, meta = pyreadstat.read_dta('././water_recode.dta') floor_recode_dict = floor_recode_df.set_index( 'floor_code').to_dict()['floor_qual'] toilet_recode_dict = toilet_recode_df.set_index( 'toilet_code').to_dict()['toilet_qual'] water_recode_dict = water_recode_df.set_index( 'water_code').to_dict()['water_qual'] water_recode_dict[63] = 4 class DHS_preparation(): def __init__(self, floor_recode: dict, toilet_recode: dict, water_recode: dict, country_code_dict: dict, features: list, info: list, dhs_survey_path: str, wealth_path: str, shape_path: str, geo_wealth_path: str,
#Importamos Dependencias o Bibliotecas necesarias: import pyreadstat # librería para leer formato ".dta" import pandas as pd # librería para manipulación de datos import numpy as np # Librería para operaciones matemáticas import matplotlib # Librería para graficar from matplotlib import pyplot as plt # Librería para graficar import statsmodels.api as sm # Librería para análisis estadístico from IPython.display import Image # Librería para importar imagénes from statsmodels.formula.api import ols # Para pruebas de hipotesis dtafile = 'Data/cgreene76.dta' # metodo para leer dataframe, meta = pyreadstat.read_dta(dtafile) print(dataframe.head(5)) dataframe['Ltotcost'] = np.log(dataframe['costs']) dataframe['Loutput'] = np.log(dataframe['output']) dataframe["Loutput_2"] = dataframe["Loutput"]**2 dataframe['Lplabor'] = np.log(dataframe['plabor']) dataframe['Lpfuel'] = np.log(dataframe['pfuel']) dataframe['Lpkap'] = np.log(dataframe['pkap']) dataframe['Lpprod'] = np.dot(dataframe['plabor'], dataframe['pfuel']) dataframe['Loutpfuel'] = np.dot(dataframe['Loutput'], dataframe['pfuel']) dataframe['One'] = 1 print(dataframe.head(10)) Y = dataframe["Ltotcost"] # que valores me faltan #X = [["One","Loutput","Loutput_2","Lplabor","Lpfuel", "Lpkap","Lpprod","Loutpfuel"]] X = dataframe[[ "One", "Loutput", "Loutput_2", "Lplabor", "Lpfuel", "Lpkap", "Lpprod", "Loutpfuel"