def main():
    #### DEFINE VARIABLES
    floor_recode_df, meta = pyreadstat.read_dta('./floor_recode.dta')
    toilet_recode_df, meta = pyreadstat.read_dta('./toilet_recode.dta')
    water_recode_df, meta = pyreadstat.read_dta('./water_recode.dta')

    floor_recode_dict = floor_recode_df.set_index(
        'floor_code').to_dict()['floor_qual']
    toilet_recode_dict = toilet_recode_df.set_index(
        'toilet_code').to_dict()['toilet_qual']
    water_recode_dict = water_recode_df.set_index(
        'water_code').to_dict()['water_qual']
    water_recode_dict[63] = 4

    country_codes = requests.get(
        'http://api.dhsprogram.com/rest/dhs/countries?returnFields=CountryName,DHS_CountryCode'
    ).text
    country_code_dict = {}
    for country in json.loads(country_codes)['Data']:
        country_code_dict[
            country['DHS_CountryCode']] = country['CountryName'].replace(
                ' ', '_')

    features = [
        'WATER_SOURCE', 'TOILET_TYPE', 'ELECTRICITY', 'RADIO', 'TV', 'FRIDGE',
        'MOTORCYCLE', 'CAR', 'FLOOR', 'ROOMSPP', 'PHONE', 'CELLPHONE'
    ]  # ,'cellphone']

    info = ['HHID', 'YEAR']
    dhs_survey_path = './dhs_household_data/'
    wealth_path = './dhs_wealth/'
    shape_path = './geo_data/'
    dhs_surveys = [
        file for file in os.listdir('./dhs_household_data/')
        if file.endswith('.SAS7BDAT')
    ]
    geo_wealth_path = './gps_csv/'
    sustainlab_group_file = 'dhs_clusters_sustainlab_group.csv'

    # initiate Class Object
    dhs_obj = DHS_preparation(floor_recode=floor_recode_dict,
                              toilet_recode=toilet_recode_dict,
                              water_recode=water_recode_dict,
                              country_code_dict=country_code_dict,
                              features=features,
                              info=info,
                              dhs_survey_path=dhs_survey_path,
                              wealth_path=wealth_path,
                              shape_path=shape_path,
                              geo_wealth_path=geo_wealth_path,
                              sustainlab_group_file=sustainlab_group_file)
    # perform dhs preparation
    # calculate wealth index and create csv files per country_year
    for filename in os.listdir(dhs_obj.dhs_survey_path):
        dhs_obj.recode_and_format_dhs(filename)
    # add geo data and create new files
    for shapefile in os.listdir(shape_path):
        dhs_obj.create_wealth_geo_df(shapefile)
    # split sustainlab clusters to files per country_year
    dhs_obj.split_sustainlab_clusters()
Exemple #2
0
 def sysuse_file(dir: str):
     try:
         self.data, self.meta = pyreadstat.read_dta(dir)
         print('(' + self.meta.file_label + ')')
         self.globals['dir'] = dir
     except pyreadstat._readstat_parser.ReadstatError:
         print_red('file \"%s\" not found' % dir)
def get_data(df_url, filepath, variables):

    survey_obj = get_survey_info(df_url, filepath)
    # read stata or sas file
    if filepath.lower().endswith('.dta'):
        df, meta = pyreadstat.read_dta(filepath,
                                       apply_value_formats=True,
                                       usecols=variables)
    elif filepath.lower().endswith('.sas7bdat'):
        df, meta = pyreadstat.read_sas7bdat(filepath, usecols=variables)

    df['survey'] = survey_obj['survey']
    df.columns = df.columns.str.upper()

    # change the variable names for household number, cluster number and line number
    # so it matches with the household memeber module
    # HV001, HV002 and HVIDX are the variables we join different modules on
    if survey_obj['module'] == 'Individual Recode':
        df.rename(columns={
            'V001': 'HV001',
            'V002': 'HV002',
            'V003': 'HVIDX'
        },
                  inplace=True)
    elif survey_obj['module'] == "Men's Recode":
        df.rename(columns={
            'MV001': 'HV001',
            'MV002': 'HV002',
            'MV003': 'HVIDX'
        },
                  inplace=True)

    # store the survey infomation in the metadata file as well
    meta_dict = meta.column_names_to_labels
    df_meta = (pd.DataFrame(
        list(meta_dict.items()),
        columns=['var', 'label']).assign(module=survey_obj['module']).assign(
            survey=survey_obj['survey']).assign(ISO=survey_obj['iso']).assign(
                year=survey_obj['year']))
    return [df, df_meta]
Exemple #4
0
def parse_file(filename):
    if "sav" in filename.lower():
        df, meta = pyreadstat.read_sav(filename,
                                       apply_value_formats=True,
                                       metadataonly=True)
        return df, meta, False
    elif "por" in filename.lower():
        df, meta = pyreadstat.read_por(filename,
                                       apply_value_formats=True,
                                       metadataonly=True)
        return df, meta, False
    elif "sas7bdat" in filename.lower():
        df, meta = pyreadstat.read_sas7bdat(filename, metadataonly=True)
        return df, meta, False
    elif "xpt" in filename.lower():
        df, meta = pyreadstat.read_xport(filename, metadataonly=True)
        return df, meta, False
    elif "dta" in filename.lower():
        df, meta = pyreadstat.read_dta(filename,
                                       apply_value_formats=True,
                                       metadataonly=True)
        return df, meta, False
    else:
        return None, None, True
Exemple #5
0
# apply the formats from the beginning
nls97spss, metaspss = pyreadstat.read_sav('data/nls97.sav', apply_value_formats=True, formats_as_category=True)
nls97spss.columns = metaspss.column_labels
nls97spss.columns = nls97spss.columns.\
  str.lower().\
  str.replace(' ','_').\
  str.replace('[^a-z0-9_]', '')
nls97spss.dtypes
nls97spss.head()
nls97spss.govt_responsibility__provide_jobs_2006.\
  value_counts(sort=False)
nls97spss.set_index('pubid__yth_id_code_1997', inplace=True)
 
# do the same for stata data
nls97stata, metastata = pyreadstat.read_dta('data/nls97.dta', apply_value_formats=True, formats_as_category=True)
nls97stata.columns = metastata.column_labels
nls97stata.columns = nls97stata.columns.\
    str.lower().\
    str.replace(' ','_').\
    str.replace('[^a-z0-9_]', '')
nls97stata.dtypes
nls97stata.head()
nls97stata.govt_responsibility__provide_jobs_2006.\
  value_counts(sort=False)
nls97stata.min()
nls97stata.replace(list(range(-9,0)), np.nan, inplace=True)
nls97stata.min()
nls97stata.set_index('pubid__yth_id_code_1997', inplace=True)

# pull sas data, using the sas catalog file for value labels
Exemple #6
0
    def read(self):
        engine = sa.create_engine(self.db)
        conn = engine.connect()
        print('----  begin >' + str(datetime.datetime.now()))
        print('-----  read >' + str(datetime.datetime.now()))
        df, meta = pyreadstat.read_dta(self.filename)

        df.to_sql("plss_tmp", con=engine, if_exists="replace")
        print('----  merge >' + str(datetime.datetime.now()))
        t = text("""
                      insert into plss ( CourseParticipationId,CourseCalendarId,ApplicationId,DateStartCourse,DateActualFinishCourse,
                                         OutcomeId,DateDeleted,OutcomeStatusId,OutcomeCertificationId,OutcomeCertificationAwardId,
                                         OutcomeEarlyFinishReasonId,OriginalOutcomeId,ApplicationOriginId,ApplicationStatusId,FETProviderId,
                                         DeliveryFETProviderId,ProgrammeId,TargetAwardId,PublishedCourseTitle,ProgrammeCategoryId,
                                         DateActualStart,DateActualFinish,DeliveryModeId,Title,Cluster,ProgrammeAwardLevel,
                                         ProgrammeCertified,TargetAward,AwardAchievable,AwardSummary,ISCEDBroadUID,ISCEDDetailedFieldID,
                                         learnerfinishdate,learnerstartyear,finishmonth,finishyear,Gender,ParentFETProviderId,Parent,
                                         ParentDivisionProviderId,FETProviderTypeId,FETProviderName,IsETB,CountyId,
                                         programmecategorydescription,hasemployers,LearnerCountyId,NationalityId,outcomestatusdescription,
                                         outcomecertificationdescription,isoutcomecertified,outcomecertificationawarddescrip,
                                         outcomedescription,hash_ppsn,age )
                          select te.CourseParticipationId,te.CourseCalendarId,te.ApplicationId,te.DateStartCourse,te.DateActualFinishCourse,
                                 te.OutcomeId,te.DateDeleted,te.OutcomeStatusId,te.OutcomeCertificationId,te.OutcomeCertificationAwardId,
                                 te.OutcomeEarlyFinishReasonId,te.OriginalOutcomeId,te.ApplicationOriginId,te.ApplicationStatusId,te.FETProviderId,
                                 te.DeliveryFETProviderId,te.ProgrammeId,te.TargetAwardId,te.PublishedCourseTitle,te.ProgrammeCategoryId,
                                 te.DateActualStart,te.DateActualFinish,te.DeliveryModeId,te.Title,te.Cluster,te.ProgrammeAwardLevel,
                                 te.ProgrammeCertified,te.TargetAward,te.AwardAchievable,te.AwardSummary,te.ISCEDBroadUID,te.ISCEDDetailedFieldID,
                                 te.learnerfinishdate,te.learnerstartyear,te.finishmonth,te.finishyear,te.Gender,te.ParentFETProviderId,te.Parent,
                                 te.ParentDivisionProviderId,te.FETProviderTypeId,te.FETProviderName,te.IsETB,te.CountyId,
                                 te.programmecategorydescription,te.hasemployers,te.LearnerCountyId,te.NationalityId,te.outcomestatusdescription,
                                 te.outcomecertificationdescription,te.isoutcomecertified,te.outcomecertificationawarddescrip,
                                 te.outcomedescription,te.hash_ppsn,te.age                 
	                      from plss_tmp te
                          left join plss pl
                              on te.hash_ppsn = pl.hash_ppsn
                                  and te.CourseParticipationId = pl.CourseParticipationId
                                  and te.CourseCalendarId = pl.CourseCalendarId 
                                  and te.ApplicationId = pl.ApplicationId
                                  and te.DateStartCourse = pl.DateStartCourse
                                  and (te.DateActualFinishCourse = pl.DateActualFinishCourse or (te.DateActualFinishCourse is null and pl.DateActualFinishCourse is null))
                                  and (te.OutcomeId = pl.OutcomeId or (te.OutcomeId is null and pl.OutcomeId is null))
                                  and (te.DateDeleted = pl.DateDeleted or (te.DateDeleted is null and pl.DateDeleted is null))
                                  and (te.OutcomeStatusId = pl.OutcomeStatusId or (te.OutcomeStatusId is null and pl.OutcomeStatusId is null))
                                  and (te.OutcomeCertificationId = pl.OutcomeCertificationId or (te.OutcomeCertificationId is null and pl.OutcomeCertificationId is null))
                                  and (te.OutcomeCertificationAwardId = pl.OutcomeCertificationAwardId or (te.OutcomeCertificationAwardId is null and pl.OutcomeCertificationAwardId is null))
                                  and (te.OutcomeEarlyFinishReasonId = pl.OutcomeEarlyFinishReasonId or (te.OutcomeEarlyFinishReasonId is null and pl.OutcomeEarlyFinishReasonId is null))
                                  and (te.OriginalOutcomeId = pl.OriginalOutcomeId or (te.OriginalOutcomeId is null and pl.OriginalOutcomeId is null))
                                  and (te.ApplicationOriginId = pl.ApplicationOriginId or (te.ApplicationOriginId is null and pl.ApplicationOriginId is null))
                                  and (te.ApplicationStatusId = pl.ApplicationStatusId or (te.ApplicationStatusId is null and pl.ApplicationStatusId is null))
                                  and (te.FETProviderId = pl.FETProviderId or (te.FETProviderId is null and pl.FETProviderId is null))
                                  and (te.DeliveryFETProviderId = pl.DeliveryFETProviderId or (te.DeliveryFETProviderId is null and pl.DeliveryFETProviderId is null))
                                  and (te.ProgrammeId = pl.ProgrammeId or (te.ProgrammeId is null and pl.ProgrammeId is null))
                                  and (te.TargetAwardId = pl.TargetAwardId or (te.TargetAwardId is null and pl.TargetAwardId is null))
                                  and (te.PublishedCourseTitle = pl.PublishedCourseTitle or (te.PublishedCourseTitle is null and pl.PublishedCourseTitle is null))
                                  and (te.ProgrammeCategoryId = pl.ProgrammeCategoryId or (te.ProgrammeCategoryId is null and pl.ProgrammeCategoryId is null))
                                  and (te.DateActualStart = pl.DateActualStart or (te.DateActualStart is null and pl.DateActualStart is null))
                                  and (te.DateActualFinish = pl.DateActualFinish or (te.DateActualFinish is null and pl.DateActualFinish is null))
                                  and (te.DeliveryModeId = pl.DeliveryModeId or (te.DeliveryModeId is null and pl.DeliveryModeId is null))
                                  and te.Title = pl.Title 
                                  and (te.Cluster = pl.Cluster or (te.Cluster is null and pl.Cluster is null))
                                  and (te.ProgrammeAwardLevel = pl.ProgrammeAwardLevel or (te.ProgrammeAwardLevel is null and pl.ProgrammeAwardLevel is null))
                                  and (te.ProgrammeCertified = pl.ProgrammeCertified or (te.ProgrammeCertified is null and pl.ProgrammeCertified is null))
                                  and (te.TargetAward = pl.TargetAward or (te.TargetAward is null and pl.TargetAward is null))
                                  and (te.AwardAchievable = pl.AwardAchievable or (te.AwardAchievable is null and pl.AwardAchievable is null))
                                  and (te.AwardSummary = pl.AwardSummary or (te.AwardSummary is null and pl.AwardSummary is null))
                                  and (te.ISCEDBroadUID = pl.ISCEDBroadUID or (te.ISCEDBroadUID is null and pl.ISCEDBroadUID is null))
                                  and (te.ISCEDDetailedFieldID = pl.ISCEDDetailedFieldID or (te.ISCEDDetailedFieldID is null and pl.ISCEDDetailedFieldID is null))
                                  and (te.learnerfinishdate = pl.learnerfinishdate or (te.learnerfinishdate is null and pl.learnerfinishdate is null))
                                  and (te.learnerstartyear = pl.learnerstartyear or (te.learnerstartyear is null and pl.learnerstartyear is null))
                                  and (te.finishmonth = pl.finishmonth or (te.finishmonth is null and pl.finishmonth is null))
                                  and (te.finishyear = pl.finishyear or (te.finishyear is null and pl.finishyear is null))
                                  and (te.Gender = pl.Gender or (te.Gender is null and pl.Gender is null))
                                  and (te.ParentFETProviderId = pl.ParentFETProviderId or (te.ParentFETProviderId is null and pl.ParentFETProviderId is null))
                                  and (te.Parent = pl.Parent or (te.Parent is null and pl.Parent is null))
                                  and (te.ParentDivisionProviderId = pl.ParentDivisionProviderId or (te.ParentDivisionProviderId is null and pl.ParentDivisionProviderId is null))
                                  and (te.FETProviderTypeId = pl.FETProviderTypeId or (te.FETProviderTypeId is null and pl.FETProviderTypeId is null))
                                  and (te.FETProviderName = pl.FETProviderName or (te.FETProviderName is null and pl.FETProviderName is null))
                                  and (te.IsETB = pl.IsETB or (te.IsETB is null and pl.IsETB is null))
                                  and (te.CountyId = pl.CountyId or (te.CountyId is null and pl.CountyId is null))
                                  and (te.programmecategorydescription = pl.programmecategorydescription or (te.programmecategorydescription is null and pl.programmecategorydescription is null))
                                  and (te.hasemployers = pl.hasemployers or (te.hasemployers is null and pl.hasemployers is null))
                                  and (te.LearnerCountyId = pl.LearnerCountyId or (te.LearnerCountyId is null and pl.LearnerCountyId is null))
                                  and (te.NationalityId = pl.NationalityId or (te.NationalityId is null and pl.NationalityId is null))
                                  and (te.outcomestatusdescription = pl.outcomestatusdescription or (te.outcomestatusdescription is null and pl.outcomestatusdescription is null))
                                  and (te.outcomecertificationdescription = pl.outcomecertificationdescription or (te.outcomecertificationdescription is null and pl.outcomecertificationdescription is null))
                                  and (te.isoutcomecertified = pl.isoutcomecertified or (te.isoutcomecertified is null and pl.isoutcomecertified is null))
                                  and (te.outcomecertificationawarddescrip = pl.outcomecertificationawarddescrip or (te.outcomecertificationawarddescrip is null and pl.outcomecertificationawarddescrip is null))
                                  and (te.outcomedescription = pl.outcomedescription or (te.outcomedescription is null and pl.outcomedescription is null))
                                  and (te.age = pl.age or (te.age is null and pl.age is null))
                          where pl.hash_ppsn is null
                 """)
        conn.execute(t)
        print('------  add >' + str(datetime.datetime.now()))
        t = text("""
                      delete
                          from plss
                          where id in (select pl.id
                                           from plss pl
                                               left join plss_tmp te
                                                   on te.hash_ppsn = pl.hash_ppsn
                                                       and (te.CourseParticipationId = pl.CourseParticipationId or (te.CourseParticipationId is null and pl.CourseParticipationId is null))
                                                       and (te.CourseCalendarId = pl.CourseCalendarId or (te.CourseCalendarId is null and pl.CourseCalendarId is null))
                                                       and (te.ApplicationId = pl.ApplicationId or (te.ApplicationId is null and pl.ApplicationId is null))
                                                       and (te.DateStartCourse = pl.DateStartCourse or (te.DateStartCourse is null and pl.DateStartCourse is null))
                                                       and (te.DateActualFinishCourse = pl.DateActualFinishCourse or (te.DateActualFinishCourse is null and pl.DateActualFinishCourse is null))
                                                       and (te.OutcomeId = pl.OutcomeId or (te.OutcomeId is null and pl.OutcomeId is null))
                                                       and (te.DateDeleted = pl.DateDeleted or (te.DateDeleted is null and pl.DateDeleted is null))
                                                       and (te.OutcomeStatusId = pl.OutcomeStatusId or (te.OutcomeStatusId is null and pl.OutcomeStatusId is null))
                                                       and (te.OutcomeCertificationId = pl.OutcomeCertificationId or (te.OutcomeCertificationId is null and pl.OutcomeCertificationId is null))
                                                       and (te.OutcomeCertificationAwardId = pl.OutcomeCertificationAwardId or (te.OutcomeCertificationAwardId is null and pl.OutcomeCertificationAwardId is null))
                                                       and (te.OutcomeEarlyFinishReasonId = pl.OutcomeEarlyFinishReasonId or (te.OutcomeEarlyFinishReasonId is null and pl.OutcomeEarlyFinishReasonId is null))
                                                       and (te.OriginalOutcomeId = pl.OriginalOutcomeId or (te.OriginalOutcomeId is null and pl.OriginalOutcomeId is null))
                                                       and (te.ApplicationOriginId = pl.ApplicationOriginId or (te.ApplicationOriginId is null and pl.ApplicationOriginId is null))
                                                       and (te.ApplicationStatusId = pl.ApplicationStatusId or (te.ApplicationStatusId is null and pl.ApplicationStatusId is null))
                                                       and (te.FETProviderId = pl.FETProviderId or (te.FETProviderId is null and pl.FETProviderId is null))
                                                       and (te.DeliveryFETProviderId = pl.DeliveryFETProviderId or (te.DeliveryFETProviderId is null and pl.DeliveryFETProviderId is null))
                                                       and (te.ProgrammeId = pl.ProgrammeId or (te.ProgrammeId is null and pl.ProgrammeId is null))
                                                       and (te.TargetAwardId = pl.TargetAwardId or (te.TargetAwardId is null and pl.TargetAwardId is null))
                                                       and (te.PublishedCourseTitle = pl.PublishedCourseTitle or (te.PublishedCourseTitle is null and pl.PublishedCourseTitle is null))
                                                       and (te.ProgrammeCategoryId = pl.ProgrammeCategoryId or (te.ProgrammeCategoryId is null and pl.ProgrammeCategoryId is null))
                                                       and (te.DateActualStart = pl.DateActualStart or (te.DateActualStart is null and pl.DateActualStart is null))
                                                       and (te.DateActualFinish = pl.DateActualFinish or (te.DateActualFinish is null and pl.DateActualFinish is null))
                                                       and (te.DeliveryModeId = pl.DeliveryModeId or (te.DeliveryModeId is null and pl.DeliveryModeId is null))
                                                       and (te.Title = pl.Title or (te.Title is null and pl.Title is null))
                                                       and (te.Cluster = pl.Cluster or (te.Cluster is null and pl.Cluster is null))
                                                       and (te.ProgrammeAwardLevel = pl.ProgrammeAwardLevel or (te.ProgrammeAwardLevel is null and pl.ProgrammeAwardLevel is null))
                                                       and (te.ProgrammeCertified = pl.ProgrammeCertified or (te.ProgrammeCertified is null and pl.ProgrammeCertified is null))
                                                       and (te.TargetAward = pl.TargetAward or (te.TargetAward is null and pl.TargetAward is null))
                                                       and (te.AwardAchievable = pl.AwardAchievable or (te.AwardAchievable is null and pl.AwardAchievable is null))
                                                       and (te.AwardSummary = pl.AwardSummary or (te.AwardSummary is null and pl.AwardSummary is null))
                                                       and (te.ISCEDBroadUID = pl.ISCEDBroadUID or (te.ISCEDBroadUID is null and pl.ISCEDBroadUID is null))
                                                       and (te.ISCEDDetailedFieldID = pl.ISCEDDetailedFieldID or (te.ISCEDDetailedFieldID is null and pl.ISCEDDetailedFieldID is null))
                                                       and (te.learnerfinishdate = pl.learnerfinishdate or (te.learnerfinishdate is null and pl.learnerfinishdate is null))
                                                       and (te.learnerstartyear = pl.learnerstartyear or (te.learnerstartyear is null and pl.learnerstartyear is null))
                                                       and (te.finishmonth = pl.finishmonth or (te.finishmonth is null and pl.finishmonth is null))
                                                       and (te.finishyear = pl.finishyear or (te.finishyear is null and pl.finishyear is null))
                                                       and (te.Gender = pl.Gender or (te.Gender is null and pl.Gender is null))
                                                       and (te.ParentFETProviderId = pl.ParentFETProviderId or (te.ParentFETProviderId is null and pl.ParentFETProviderId is null))
                                                       and (te.Parent = pl.Parent or (te.Parent is null and pl.Parent is null))
                                                       and (te.ParentDivisionProviderId = pl.ParentDivisionProviderId or (te.ParentDivisionProviderId is null and pl.ParentDivisionProviderId is null))
                                                       and (te.FETProviderTypeId = pl.FETProviderTypeId or (te.FETProviderTypeId is null and pl.FETProviderTypeId is null))
                                                       and (te.FETProviderName = pl.FETProviderName or (te.FETProviderName is null and pl.FETProviderName is null))
                                                       and (te.IsETB = pl.IsETB or (te.IsETB is null and pl.IsETB is null))
                                                       and (te.CountyId = pl.CountyId or (te.CountyId is null and pl.CountyId is null))
                                                       and (te.programmecategorydescription = pl.programmecategorydescription or (te.programmecategorydescription is null and pl.programmecategorydescription is null))
                                                       and (te.hasemployers = pl.hasemployers or (te.hasemployers is null and pl.hasemployers is null))
                                                       and (te.LearnerCountyId = pl.LearnerCountyId or (te.LearnerCountyId is null and pl.LearnerCountyId is null))
                                                       and (te.NationalityId = pl.NationalityId or (te.NationalityId is null and pl.NationalityId is null))
                                                       and (te.outcomestatusdescription = pl.outcomestatusdescription or (te.outcomestatusdescription is null and pl.outcomestatusdescription is null))
                                                       and (te.outcomecertificationdescription = pl.outcomecertificationdescription or (te.outcomecertificationdescription is null and pl.outcomecertificationdescription is null))
                                                       and (te.isoutcomecertified = pl.isoutcomecertified or (te.isoutcomecertified is null and pl.isoutcomecertified is null))
                                                       and (te.outcomecertificationawarddescrip = pl.outcomecertificationawarddescrip or (te.outcomecertificationawarddescrip is null and pl.outcomecertificationawarddescrip is null))
                                                       and (te.outcomedescription = pl.outcomedescription or (te.outcomedescription is null and pl.outcomedescription is null))
                                                       and (te.age = pl.age or (te.age is null and pl.age is null))
                                               where pl.hash_ppsn is null)
                   """)
        conn.execute(t)
        print('------  del >' + str(datetime.datetime.now()))
        t = text("drop table plss_tmp")
        conn.execute(t)
        print('-----  drop >' + str(datetime.datetime.now()))
Exemple #7
0
def load_data(filename, columns_list=None, the_first=None, the_last=None):
    """ 
    It read and load different dataset formats into DataFrame. 
    Also support optionally to load specified list of columns only.
    Additionaly help us to load the first or the last specified number
    of observation/rows of the dataset. 

    Parameters
    ----------
    filename: str
        Dataset name 
        .. You should include the right directory where the dataset exist!
        
    columns_list: list of string, default ``None``, optional
        It help us to load specific list of columns optionally. 
    
    the_first : int, default ``None``, optional
        Help to load the first n number of(head) observation/rows of the dataset.
        
    the_last : int, default ``None``, optional
        Help to load the last n number (tail) of observation/rows of the dataset.
    
    Returns
    -------
    ``DataFrame``
        It return the required DataFrame  
    Examples
    --------
    filename = "../ET_2016_DHS_11082019_632_141211/ETBR71SV/BR.csv"
    >>>list_columns(filename)
    >>>list_columns(filename,columns_list = ['V001','V384D'])
    >>>list_columns(filename,columns_list= ['V001','V384D'], the_first = 10)
    >>>list_columns(filename,the_last = 22)
    """
    if filename.endswith('.csv') or filename.endswith('.CSV'):
        if columns_list:
            if the_first:
                return pd.read_csv(filename,
                                   usecols=columns_list).head(the_first)
            if the_last:
                return pd.read_csv(filename,
                                   usecols=columns_list).tail(the_last)
            return pd.read_csv(filename, usecols=columns_list)
        else:
            if the_first:
                return pd.read_csv(filename).head(the_first)
            if the_last:
                return pd.read_csv(filename).tail(the_last)
            return pd.read_csv(filename)

    elif filename.endswith('.DTA') or filename.endswith('.dta'):
        if columns_list:
            if the_first:
                df, meta = ps.read_dta(filename, usecols=columns_list)
                return df.head(the_first)
            if the_last:
                df, meta = ps.read_dta(filename, usecols=columns_list)
                return df.tail(the_last)
            df, meta = ps.read_dta(filename, usecols=columns_list)
            return df
        else:
            if the_first:
                df, meta = ps.read_dta(filename)
                return df.head(the_first)
            if the_last:
                df, meta = ps.read_dta(filename)
                return df.tail(the_last)

            df, meta = ps.read_dta(filename)
            return df

    elif filename.endswith('.SAV') or filename.endswith('.sav'):
        if columns_list:
            if the_first:
                df, meta = ps.read_sav(filename, usecols=columns_list)
                return df.head(the_first)
            if the_last:
                df, meta = ps.read_sav(filename, usecols=columns_list)
                return df.tail(the_last)
            df, meta = ps.read_sav(filename, usecols=columns_list)
            return df
        else:
            if the_first:
                df, meta = ps.read_sav(filename)
                return df.head(the_first)
            if the_last:
                df, meta = ps.read_sav(filename)
                return df.tail(the_last)

            df, meta = ps.read_sav(filename)
            return df
# Imports
import pyreadstat
import requests
import json
import pandas as pd
import os
import geopandas as gpd

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing

floor_recode_df, meta = pyreadstat.read_dta('././floor_recode.dta')
toilet_recode_df, meta = pyreadstat.read_dta('././toilet_recode.dta')
water_recode_df, meta = pyreadstat.read_dta('././water_recode.dta')

floor_recode_dict = floor_recode_df.set_index(
    'floor_code').to_dict()['floor_qual']
toilet_recode_dict = toilet_recode_df.set_index(
    'toilet_code').to_dict()['toilet_qual']
water_recode_dict = water_recode_df.set_index(
    'water_code').to_dict()['water_qual']
water_recode_dict[63] = 4


class DHS_preparation():
    def __init__(self, floor_recode: dict, toilet_recode: dict,
                 water_recode: dict, country_code_dict: dict, features: list,
                 info: list, dhs_survey_path: str, wealth_path: str,
                 shape_path: str, geo_wealth_path: str,
Exemple #9
0
#Importamos Dependencias o Bibliotecas necesarias:
import pyreadstat  # librería para leer formato ".dta"
import pandas as pd  # librería para manipulación de datos
import numpy as np  # Librería para operaciones matemáticas
import matplotlib  # Librería para graficar
from matplotlib import pyplot as plt  # Librería para graficar
import statsmodels.api as sm  # Librería para análisis estadístico
from IPython.display import Image  # Librería para importar imagénes
from statsmodels.formula.api import ols  # Para pruebas de hipotesis

dtafile = 'Data/cgreene76.dta'
# metodo para leer
dataframe, meta = pyreadstat.read_dta(dtafile)
print(dataframe.head(5))
dataframe['Ltotcost'] = np.log(dataframe['costs'])
dataframe['Loutput'] = np.log(dataframe['output'])
dataframe["Loutput_2"] = dataframe["Loutput"]**2
dataframe['Lplabor'] = np.log(dataframe['plabor'])
dataframe['Lpfuel'] = np.log(dataframe['pfuel'])
dataframe['Lpkap'] = np.log(dataframe['pkap'])
dataframe['Lpprod'] = np.dot(dataframe['plabor'], dataframe['pfuel'])
dataframe['Loutpfuel'] = np.dot(dataframe['Loutput'], dataframe['pfuel'])
dataframe['One'] = 1
print(dataframe.head(10))

Y = dataframe["Ltotcost"]
# que valores me faltan
#X = [["One","Loutput","Loutput_2","Lplabor","Lpfuel", "Lpkap","Lpprod","Loutpfuel"]]
X = dataframe[[
    "One", "Loutput", "Loutput_2", "Lplabor", "Lpfuel", "Lpkap", "Lpprod",
    "Loutpfuel"