Esempi in Python per convert_free_text, esempi in Python per util.ETL_helpers.convert_free_text

Esempio n. 1

0

Mostra file

File: clean_noble.py Progetto: dssg/education-college-public

def ap(noble_aps):
    """Build the cleaned AP-test table from the raw Noble AP frame.

    Reads the ``student_number``, ``test_date``, ``numscore`` and
    ``testname`` columns of ``noble_aps`` and returns a new DataFrame with
    the columns ``noble_powerschool_id``, ``date``, ``score`` and
    ``subject``.

    Side effect (kept for backward compatibility): ``noble_aps.testname``
    is overwritten with the test names after 'AP ' has been removed.
    """
    # initialize new df
    ap_tests = pd.DataFrame()
    # add id
    ap_tests['noble_powerschool_id'] = noble_aps['student_number']
    # add date
    ap_tests['date'] = pd.to_datetime(noble_aps['test_date'])
    # add score
    ap_tests['score'] = noble_aps['numscore']
    # load ap dictionary from manually coded csv
    subject_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/APsubjecttranslation.csv')
    # Remove the 'AP ' marker, skipping null entries. The fillna('missing')
    # below shows that missing test names are expected; a null value has no
    # .replace() method, so stripping unconditionally raised AttributeError
    # before fillna ever ran.
    noble_aps.testname = noble_aps.testname.apply(
        lambda x: x if pd.isnull(x) else x.replace('AP ', ''))
    # do conversions from freetext to coded options
    ap_tests['subject'] = noble_aps.testname.fillna('missing').apply(
        lambda x: convert_free_text(subject_dict, x))
    return ap_tests

Esempio n. 2

0

Mostra file

File: clean_noble.py Progetto: dssg/education-college-public

def ap(noble_aps):
    """Build the cleaned AP-test table from the raw Noble AP frame.

    Reads the ``student_number``, ``test_date``, ``numscore`` and
    ``testname`` columns of ``noble_aps`` and returns a new DataFrame with
    the columns ``noble_powerschool_id``, ``date``, ``score`` and
    ``subject``.

    Side effect (kept for backward compatibility): ``noble_aps.testname``
    is overwritten with the test names after 'AP ' has been removed.
    """
    #initialize new df
    ap_tests = pd.DataFrame()
    # add id
    ap_tests['noble_powerschool_id'] = noble_aps['student_number']
    # add date
    ap_tests['date'] = pd.to_datetime(noble_aps['test_date'])
    # add score
    ap_tests['score'] = noble_aps['numscore']
    # load ap dictionary from manually coded csv
    subject_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/APsubjecttranslation.csv')
    # Remove the 'AP ' marker, skipping null entries. The fillna('missing')
    # below shows that missing test names are expected; a null value has no
    # .replace() method, so stripping unconditionally raised AttributeError
    # before fillna ever ran.
    noble_aps.testname = noble_aps.testname.apply(
        lambda x: x if pd.isnull(x) else x.replace('AP ', ''))
    # do conversions from freetext to coded options
    ap_tests['subject'] = noble_aps.testname.fillna('missing').apply(
        lambda x: convert_free_text(subject_dict, x))
    return ap_tests

Esempio n. 3

0

Mostra file

File: clean_noble.py Progetto: dssg/education-college-public

def enrollments(enrollment):
    """Reshape the raw Noble enrollment frame into the cleaned layout.

    Returns a new DataFrame holding the student/college ids, sanitized
    dates, status, data source, an all-NaN ``living_on_campus`` column,
    the degree type with apostrophes removed, coded degree subject and
    major, and one ``withdrawal_reason_*`` column per tracked reason.

    Side effect: the ``Start_Date__c``, ``End_Date__c`` and
    ``Date_Last_Verified__c`` columns of ``enrollment`` are overwritten
    with their ``pd.to_datetime`` conversions.
    """

    def has_withdrawal_code(wanted, raw_codes):
        # NaN for the 'missing' placeholder; otherwise whether `wanted`
        # is one of the ';'-separated codes.
        if raw_codes == 'missing':
            return np.nan
        return wanted in raw_codes.split(';')

    cleaned = pd.DataFrame()

    # student and college ids
    cleaned['noble_student_sf_id'] = enrollment['Student__c']
    cleaned['noble_college_sf_id'] = enrollment['College__c']

    # (raw column, cleaned column) pairs for the date fields
    date_columns = (
        ('Start_Date__c', 'start_date'),
        ('End_Date__c', 'end_date'),
        ('Date_Last_Verified__c', 'date_last_verified'),
    )
    # parse all raw date columns first (this overwrites them on the input)
    for raw_name, _ in date_columns:
        enrollment[raw_name] = pd.to_datetime(enrollment[raw_name])
    # then copy them over with implausible dates nulled out
    for raw_name, clean_name in date_columns:
        cleaned[clean_name] = enrollment[raw_name].apply(wrong_dates_to_null)

    # status and data source are taken over unchanged
    cleaned['status'] = enrollment['Status__c']
    cleaned['data_source'] = enrollment['Data_Source__c']

    # living on campus is not available here
    cleaned['living_on_campus'] = np.nan

    # degree type without apostrophes; rows with a null type stay null
    degree_types = enrollment['Degree_Type__c'].dropna()
    cleaned['degree_type'] = degree_types.apply(
        lambda value: str(value).replace("'", ""))

    # translation dictionaries built from the manually coded csvs
    degree_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/degreetranslation.csv')
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')

    # free text -> coded options
    degree_text = enrollment['Degree_Text__c'].fillna('missing')
    cleaned['degree_subject'] = degree_text.apply(
        lambda text: convert_free_text(degree_dict, text))
    major_text = enrollment['Major_Text__c'].fillna('missing')
    cleaned['major'] = major_text.apply(
        lambda text: convert_free_text(major_dict, text))

    # one boolean (or NaN) column per tracked withdrawal reason
    withdrawal_text = enrollment['Withdrawal_code__c'].fillna('missing')
    tracked_reasons = ('Financial', 'Academic', 'Motivational', 'Family',
                       'Health', 'Social', 'Racial Conflict')
    for reason in tracked_reasons:
        column = 'withdrawal_reason_' + "_".join(reason.lower().split())
        cleaned[column] = withdrawal_text.apply(
            lambda raw, reason=reason: has_withdrawal_code(reason, raw))
    cleaned.rename(
        columns={'withdrawal_reason_racial_conflict': 'withdrawal_reason_racial'},
        inplace=True)

    return cleaned

Esempio n. 4

0

Mostra file

File: clean_kippnj.py Progetto: dssg/education-college-public

def enrollments_table(kipp_nj_enrollments):
    '''Cleans KIPP NJ enrollment data to match our database schema.

    Filters out non-enrollment statuses and non-college degree types,
    maps status / degree type / degree subject / major to coded values,
    converts the date columns to datetimes, expands ``transfer_reason__c``
    into ``withdrawal_reason_*`` columns, nulls out ``college_ncesid``
    values above 999999, and renames the id columns to ``kipp_nj_sf_id``
    and ``ipedsid``. Returns the cleaned DataFrame.
    '''

    # remove the did not enroll, other, and deferred enrollment types.
    # .copy() makes the result an independent frame so the column
    # assignments below do not operate on a slice of the input.
    clean_enrollments = kipp_nj_enrollments[
        ~kipp_nj_enrollments.status.isin(
            ['Other', 'Did Not Enroll', 'Deferred'])].copy()

    # recode status values through status_fixed_mapping (defined elsewhere)
    clean_enrollments['status'] = clean_enrollments['status'].apply(
        lambda x: map_value_from_dict(status_fixed_mapping, x))

    # make dates into dates
    for date_column in ('start_date', 'end_date', 'date_last_verified'):
        clean_enrollments[date_column] = pd.to_datetime(
            clean_enrollments[date_column])

    # deal with degree type
    # remove high school diploma and GED, only interested in college enrollments
    clean_enrollments = clean_enrollments[
        ~clean_enrollments.degree_type.isin(
            ['High School Diploma', 'GED'])].copy()
    clean_enrollments['degree_type'] = clean_enrollments['degree_type'].apply(
        lambda x: map_value_from_dict(degree_fixed_mapping, x))

    # clean up degree subject
    clean_enrollments['degree_subject'] = clean_enrollments['degree_type'].combine(
        clean_enrollments['degree_subject'], func=code_degree_subject)

    # clean up major
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')
    # do conversions from freetext to coded options
    clean_enrollments['major'] = clean_enrollments['major'].fillna(
        'missing').apply(lambda x: convert_free_text(major_dict, x))

    # map transfer reasons to the reasons we keep track of
    withdrawal_reasons = pd.get_dummies(
        clean_enrollments.transfer_reason__c).astype(bool)
    # NOTE(review): these hard-coded labels assume get_dummies produces
    # exactly seven columns in this order; any other set of transfer
    # reasons raises ValueError (length mismatch) or mislabels columns.
    # Confirm against the source data.
    withdrawal_reasons.columns = [
        'withdrawal_reason_academic', 'withdrawal_reason_career',
        'withdrawal_reason_financial', 'withdrawal_reason_other',
        'withdrawal_reason_placement', 'withdrawal_reason_relocation',
        'withdrawal_reason_social'
    ]
    withdrawal_reasons = withdrawal_reasons.drop([
        'withdrawal_reason_relocation', 'withdrawal_reason_placement',
        'withdrawal_reason_other', 'withdrawal_reason_career'
    ], axis=1)
    # join the columns back into the original
    clean_enrollments = clean_enrollments.join(withdrawal_reasons)
    # reasons with no counterpart among the kept dummy columns
    clean_enrollments['withdrawal_reason_motivational'] = np.nan
    clean_enrollments['withdrawal_reason_family'] = np.nan
    clean_enrollments['withdrawal_reason_health'] = np.nan
    clean_enrollments['withdrawal_reason_racial'] = np.nan

    # Drop invalid IPEDS id.
    # pd.to_numeric(errors='coerce') replaces the removed
    # Series.convert_objects(convert_numeric=True): unparseable values
    # become NaN.
    clean_enrollments['college_ncesid'] = pd.to_numeric(
        clean_enrollments['college_ncesid'], errors='coerce')
    clean_enrollments.loc[clean_enrollments['college_ncesid'] > 999999,
                          'college_ncesid'] = np.nan
    clean_enrollments['college_ncesid'] = clean_enrollments[
        'college_ncesid'].apply(int_with_NaN_tostr)

    clean_enrollments = clean_enrollments.drop(
        ['transfer_reason__c', 'college_salesforce_id'], axis=1)
    clean_enrollments = clean_enrollments.rename(columns={
        'student_salesforce_id': 'kipp_nj_sf_id',
        'college_ncesid': 'ipedsid'
    })
    return clean_enrollments

Esempio n. 5

0

Mostra file

File: clean_noble.py Progetto: dssg/education-college-public

def enrollments(enrollment):
    """Reshape the raw Noble enrollment frame into the cleaned layout.

    Returns a new DataFrame holding the student/college ids, sanitized
    dates, status, data source, an all-NaN ``living_on_campus`` column,
    the degree type with apostrophes removed, coded degree subject and
    major, and one ``withdrawal_reason_*`` column per tracked reason.

    Side effect: the ``Start_Date__c``, ``End_Date__c`` and
    ``Date_Last_Verified__c`` columns of ``enrollment`` are overwritten
    with their ``pd.to_datetime`` conversions.
    """

    def has_withdrawal_code(wanted, raw_codes):
        # NaN for the 'missing' placeholder; otherwise whether `wanted`
        # is one of the ';'-separated codes.
        if raw_codes == 'missing':
            return np.nan
        return wanted in raw_codes.split(';')

    cleaned = pd.DataFrame()

    # student and college ids
    cleaned['noble_student_sf_id'] = enrollment['Student__c']
    cleaned['noble_college_sf_id'] = enrollment['College__c']

    # (raw column, cleaned column) pairs for the date fields
    date_columns = (
        ('Start_Date__c', 'start_date'),
        ('End_Date__c', 'end_date'),
        ('Date_Last_Verified__c', 'date_last_verified'),
    )
    # parse all raw date columns first (this overwrites them on the input)
    for raw_name, _ in date_columns:
        enrollment[raw_name] = pd.to_datetime(enrollment[raw_name])
    # then copy them over with implausible dates nulled out
    for raw_name, clean_name in date_columns:
        cleaned[clean_name] = enrollment[raw_name].apply(wrong_dates_to_null)

    # status and data source are taken over unchanged
    cleaned['status'] = enrollment['Status__c']
    cleaned['data_source'] = enrollment['Data_Source__c']

    # living on campus is not available here
    cleaned['living_on_campus'] = np.nan

    # degree type without apostrophes; rows with a null type stay null
    degree_types = enrollment['Degree_Type__c'].dropna()
    cleaned['degree_type'] = degree_types.apply(
        lambda value: str(value).replace("'", ""))

    # translation dictionaries built from the manually coded csvs
    degree_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/degreetranslation.csv')
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')

    # free text -> coded options
    degree_text = enrollment['Degree_Text__c'].fillna('missing')
    cleaned['degree_subject'] = degree_text.apply(
        lambda text: convert_free_text(degree_dict, text))
    major_text = enrollment['Major_Text__c'].fillna('missing')
    cleaned['major'] = major_text.apply(
        lambda text: convert_free_text(major_dict, text))

    # one boolean (or NaN) column per tracked withdrawal reason
    withdrawal_text = enrollment['Withdrawal_code__c'].fillna('missing')
    tracked_reasons = ('Financial', 'Academic', 'Motivational', 'Family',
                       'Health', 'Social', 'Racial Conflict')
    for reason in tracked_reasons:
        column = 'withdrawal_reason_' + "_".join(reason.lower().split())
        cleaned[column] = withdrawal_text.apply(
            lambda raw, reason=reason: has_withdrawal_code(reason, raw))
    cleaned.rename(
        columns={'withdrawal_reason_racial_conflict': 'withdrawal_reason_racial'},
        inplace=True)

    return cleaned

Esempio n. 6

0

Mostra file

def enrollments_table(kipp_nj_enrollments):
    '''Cleans KIPP NJ enrollment data to match our database schema.

    Filters out non-enrollment statuses and non-college degree types,
    maps status / degree type / degree subject / major to coded values,
    converts the date columns to datetimes, expands ``transfer_reason__c``
    into ``withdrawal_reason_*`` columns, nulls out ``college_ncesid``
    values above 999999, and renames the id columns to ``kipp_nj_sf_id``
    and ``ipedsid``. Returns the cleaned DataFrame.
    '''

    # remove the did not enroll, other, and deferred enrollment types.
    # .copy() makes the result an independent frame so the column
    # assignments below do not operate on a slice of the input.
    clean_enrollments = kipp_nj_enrollments[
        ~kipp_nj_enrollments.status.isin(
            ['Other', 'Did Not Enroll', 'Deferred'])].copy()

    # recode status values through status_fixed_mapping (defined elsewhere)
    clean_enrollments['status'] = clean_enrollments['status'].apply(
        lambda x: map_value_from_dict(status_fixed_mapping, x))

    # make dates into dates
    for date_column in ('start_date', 'end_date', 'date_last_verified'):
        clean_enrollments[date_column] = pd.to_datetime(
            clean_enrollments[date_column])

    # deal with degree type
    # remove high school diploma and GED, only interested in college enrollments
    clean_enrollments = clean_enrollments[
        ~clean_enrollments.degree_type.isin(
            ['High School Diploma', 'GED'])].copy()
    clean_enrollments['degree_type'] = clean_enrollments['degree_type'].apply(
        lambda x: map_value_from_dict(degree_fixed_mapping, x))

    # clean up degree subject
    clean_enrollments['degree_subject'] = clean_enrollments['degree_type'].combine(
        clean_enrollments['degree_subject'], func=code_degree_subject)

    # clean up major
    major_dict = create_conversion_dict(
        config.PERSISTENCE_PATH + '/code/etl/mappers/majortranslation.csv')
    # do conversions from freetext to coded options
    clean_enrollments['major'] = clean_enrollments['major'].fillna(
        'missing').apply(lambda x: convert_free_text(major_dict, x))

    # map transfer reasons to the reasons we keep track of
    withdrawal_reasons = pd.get_dummies(
        clean_enrollments.transfer_reason__c).astype(bool)
    # NOTE(review): these hard-coded labels assume get_dummies produces
    # exactly seven columns in this order; any other set of transfer
    # reasons raises ValueError (length mismatch) or mislabels columns.
    # Confirm against the source data.
    withdrawal_reasons.columns = [
        'withdrawal_reason_academic', 'withdrawal_reason_career',
        'withdrawal_reason_financial', 'withdrawal_reason_other',
        'withdrawal_reason_placement', 'withdrawal_reason_relocation',
        'withdrawal_reason_social'
    ]
    withdrawal_reasons = withdrawal_reasons.drop([
        'withdrawal_reason_relocation', 'withdrawal_reason_placement',
        'withdrawal_reason_other', 'withdrawal_reason_career'
    ], axis=1)
    # join the columns back into the original
    clean_enrollments = clean_enrollments.join(withdrawal_reasons)
    # reasons with no counterpart among the kept dummy columns
    clean_enrollments['withdrawal_reason_motivational'] = np.nan
    clean_enrollments['withdrawal_reason_family'] = np.nan
    clean_enrollments['withdrawal_reason_health'] = np.nan
    clean_enrollments['withdrawal_reason_racial'] = np.nan

    # Drop invalid IPEDS id.
    # pd.to_numeric(errors='coerce') replaces the removed
    # Series.convert_objects(convert_numeric=True): unparseable values
    # become NaN.
    clean_enrollments['college_ncesid'] = pd.to_numeric(
        clean_enrollments['college_ncesid'], errors='coerce')
    clean_enrollments.loc[clean_enrollments['college_ncesid'] > 999999,
                          'college_ncesid'] = np.nan
    clean_enrollments['college_ncesid'] = clean_enrollments[
        'college_ncesid'].apply(int_with_NaN_tostr)

    clean_enrollments = clean_enrollments.drop(
        ['transfer_reason__c', 'college_salesforce_id'], axis=1)
    clean_enrollments = clean_enrollments.rename(columns={
        'student_salesforce_id': 'kipp_nj_sf_id',
        'college_ncesid': 'ipedsid'
    })
    return clean_enrollments