def process_center_days():
    """Clean and process the center days dataset.

    Identifying columns are dropped, column names are cleaned, an
    `as_of` column is added to record when the change was captured,
    and the training member is removed.

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    center_days = pd.read_csv(f"{raw_data}\\center_days.csv")
    # identifying/redundant columns live in other tables
    center_days = center_days.drop(
        ["ParticipantName", "Center", "TimeAttending"], axis=1
    )
    center_days.columns = clean_table_columns(center_days.columns)
    # stamp each row so historic changes can be tracked over time
    center_days["as_of"] = pd.to_datetime("today").date()
    # guard against duplicate column names after cleaning
    assert len(set(center_days.columns)) == len(center_days.columns)
    # 1003 is the training member; exclude it from all outputs
    center_days = center_days[center_days.member_id != 1003]
    center_days.to_csv(f"{processed_data}\\center_days.csv", index=False)
    return center_days
def load_utlization(path):
    """Load a utilization csv from the raw data folder into a DataFrame.

    Column names are cleaned, facility names in the facility column are
    replaced with the agreed-upon common names, and the training member
    is removed.

    Args:
        path(str): name of raw data csv in raw_data folder

    Returns:
        df(DataFrame): pandas dataframe
    """
    df = pd.read_csv(f"{raw_data}\\{path}", parse_dates=["AdmissionDate"])
    renames = {
        "MemberID": "member_id",
        "LOSDays": "los",
        "FacilityName": "facility",
    }
    df = df.rename(columns=renames)
    df.columns = clean_table_columns(df.columns)
    # standardize facility names on whichever column mentions "facility"
    facility_col = [col for col in df.columns if "facility" in col][0]
    df = cognify_facility_changes(df, facility_col)
    # 1003 is the training member; exclude it
    df = df[df.member_id != 1003]
    return df
def process_payments():
    """Clean and process the payments dataset.

    Indicated columns are dropped, column names are cleaned, facility
    names in the vendor column are replaced with decided common names,
    and the total paid column is converted from US currency strings to
    floats.

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    payments = pd.read_csv(
        f"{raw_data}\\payments.csv",
        parse_dates=["DatePaid", "DateClaim", "ServiceDate", "ServiceDateTo"],
        low_memory=False,
    )
    renames = {
        "ClaimID": "claim_id",
        "UB_Invoice": "ub_invoice",
        "AuthID": "auth_id",
        "DMEItem": "dme_item",
        "Check": "check_num",
    }
    payments = payments.rename(columns=renames)
    payments.columns = clean_table_columns(payments.columns)
    # US currency strings ("$1,234.56") -> float
    payments["total_paid"] = payments["total_paid"].apply(atof)
    # map vendor spelling variants onto one canonical facility name
    hosp_replace = {
        "Roger Williams Med Center": "Roger Williams Medical Center",
        "Psych Our Lady of Fatima": "Our Lady of Fatima Hospital",
        "Our Lady of Fatima Hosp": "Our Lady of Fatima Hospital",
        "Our Lady of Fatima": "Our Lady of Fatima Hospital",
        "The Miriam Hospital Lab": "The Miriam Hospital",
        "Hosp The Miriam Hospital": "The Miriam Hospital",
        "Bayberry Commons": "Bayberry Commons Nursing & Rehabilitation Center",
        "Cedar Crest Nursing Centre": "Cedar Crest Nursing Center",
        "Berkshire Place, Ltd.": "Berkshire Place Nursing and Rehab",
        "Scandinavian Home Inc": "Scandinavian Home",
    }
    payments["vendor"].replace(hosp_replace, inplace=True)
    payments = payments.drop(["program", "center", "participant"], axis=1)
    payments.to_csv(f"{processed_data}\\payments.csv", index=False)
    return payments
def process_detail_claims():
    """Clean and process the detailed claims dataset.

    Indicated columns are dropped, column names are cleaned, and
    facility names in the vendor column are replaced with decided
    common names.

    NOTE(review): unlike the other processors, the training member
    (member_id 1003) is not filtered here — confirm whether that is
    intentional.

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    date_cols = [
        "First DOS",
        "Last DOS",
        "Received Date",
        "InAccountingDate",
        "CheckDate",
        "ClaimLineCreatedDate",
    ]
    claims_detail = pd.read_csv(
        f"{raw_data}\\claims_detail.csv",
        parse_dates=date_cols,
        low_memory=False,
    )
    claims_detail.columns = clean_table_columns(claims_detail.columns)
    # map vendor spelling variants onto one canonical facility name
    hosp_replace = {
        "Roger Williams Med Center": "Roger Williams Medical Center",
        "Psych Our Lady of Fatima": "Our Lady of Fatima Hospital",
        "Our Lady of Fatima Hosp": "Our Lady of Fatima Hospital",
        "Our Lady of Fatima": "Our Lady of Fatima Hospital",
        "The Miriam Hospital Lab": "The Miriam Hospital",
        "Hosp The Miriam Hospital": "The Miriam Hospital",
        "Bayberry Commons": "Bayberry Commons Nursing & Rehabilitation Center",
        "Cedar Crest Nursing Centre": "Cedar Crest Nursing Center",
        "Berkshire Place, Ltd.": "Berkshire Place Nursing and Rehab",
        "Scandinavian Home Inc": "Scandinavian Home",
    }
    claims_detail["vendor"].replace(hosp_replace, inplace=True)
    claims_detail = claims_detail.drop(["participant_name"], axis=1)
    claims_detail.to_csv(f"{processed_data}\\claims_detail.csv", index=False)
    return claims_detail
def process_referrals():
    """Clean and process the referrals dataset.

    Column names are cleaned and the close reason column is split into
    three columns: close_date, close_type, close_details. "NOT
    SPECIFIED" referral sources are nulled and the training member is
    dropped.

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    referrals = pd.read_csv(
        f"{raw_data}\\referrals.csv",
        parse_dates=[
            "ReferralDate",
            "IntakeVisit",
            "FirstVisitDay",
            "EnrollmentEffective",
        ],
    )
    referrals.columns = clean_table_columns(referrals.columns)
    # close_reason is shaped "<date>: <type> - <details>"; split in two
    # stages. n=1 limits each split to one cut, so an extra ":" or "- "
    # inside the free-text details no longer makes expand=True produce
    # more columns than the two being assigned (which would raise
    # "Columns must be same length as key").
    referrals[["close_date", "close_type"]] = referrals["close_reason"].str.split(
        ":", n=1, expand=True
    )
    referrals[["close_type", "close_details"]] = referrals["close_type"].str.split(
        "- ", n=1, expand=True
    )
    referrals.drop("close_reason", axis=1, inplace=True)
    referrals["close_type"] = referrals["close_type"].str.strip()
    referrals["close_details"] = referrals["close_details"].str.strip()
    referrals["close_date"] = pd.to_datetime(referrals["close_date"])
    # "NOT SPECIFIED" carries no information; treat as missing
    referrals["referral_source"].replace({"NOT SPECIFIED": np.nan}, inplace=True)
    # guard against duplicate column names after cleaning
    assert len(set(referrals.columns)) == len(referrals.columns)
    # 1003 is the training member; exclude it
    referrals = referrals[referrals.member_id != 1003]
    referrals.to_csv(f"{processed_data}\\referrals.csv", index=False)
    return referrals
def process_demographics():
    """Clean and process the demographics dataset.

    Unknown race is replaced with "Hispanic or Latino" when that is the
    corresponding value in the ethnicity column, the ethnicity column is
    then dropped, gender is coded Female/Male as 1/0, and the training
    member is removed.

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    demographics = pd.read_csv(f"{raw_data}\\demographics.csv")

    # fill race with Hispanic/Latino from ethnicity if unknown.
    # BUG FIX: `.at` only accepts a single scalar label and raises on a
    # boolean mask; `.loc` is the mask-based setter intended here.
    latino_mask = (demographics["ethnicity"] == "Hispanic or Latino") & (
        demographics["race"] == "Unknown"
    )
    demographics.loc[latino_mask, "race"] = "Hispanic or Latino"

    # Shorten Other Race
    demographics["race"].replace({"Other Race": "Other"}, inplace=True)

    # ethnicity is now folded into race; drop it
    demographics.drop("ethnicity", axis=1, inplace=True)

    # code Female/Male as 1/0
    demographics["gender"] = demographics["gender"].str.strip()
    demographics["gender"].replace({"Female": 1, "Male": 0}, inplace=True)

    # create datetime col of date of birth column
    demographics["dob"] = pd.to_datetime(demographics["dob"])

    # clean column names and ensure no duplicates remain
    demographics.columns = clean_table_columns(demographics.columns)
    assert len(set(demographics.columns)) == len(demographics.columns)

    # 1003 is the training member; exclude it
    demographics = demographics[demographics.member_id != 1003]
    demographics.to_csv(f"{processed_data}\\demographics.csv", index=False)
    return demographics
def process_wounds():
    """Clean and process the wounds dataset.

    Indicated columns are dropped, column names are cleaned, and rows
    without a member_id are removed.

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    wounds = pd.read_csv(
        f"{raw_data}\\wounds.csv",
        parse_dates=["Date Time Occurred", "Date Healed"],
    )
    wounds = wounds.drop(["Participant"], axis=1)
    wounds.columns = clean_table_columns(wounds.columns)
    # rows without a member cannot be joined to anything downstream
    wounds.dropna(subset=["member_id"], inplace=True)
    # member_id reads as float once NaNs were present; restore int
    wounds["member_id"] = wounds["member_id"].astype(int)
    wounds.to_csv(f"{processed_data}\\wounds.csv", index=False)
    return wounds
def process_authorizations():
    """Clean and process the authorizations dataset.

    Indicated columns are dropped, approval expiration dates with year
    9999 are blanked, column names are cleaned, auth and service codes
    are replaced with descriptions, referring/referred-to provider first
    and last names are merged into single columns, NA member values and
    the training member are dropped.

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    auths = pd.read_csv(
        f"{raw_data}\\auths.csv",
        parse_dates=[
            "EnteredDate",
            "ModifiedDate",
            "ApprovalEffectiveDate",
            "ExpectedServiceDate",
        ],
    )
    drop_cols = [
        "Source",
        "EnteredBy",
        "ModifiedBy",
        "ParticipantFirstName",
        "ParticipantLastName",
        "ReferredToProviderOfService",
        "Hours",
        "Frequency",
        "LengthOfStay",
    ]
    auths.drop(drop_cols, axis=1, inplace=True)

    # year-9999 expirations are "never expires" placeholders; blank them
    # so to_datetime below yields NaT instead of overflowing.
    # Robustness fix: the column may contain NaN (float), which would
    # make `"9999" in outside_date` raise TypeError — only test strings.
    replace_outside_dates = {}
    for outside_date in auths["ApprovalExpirationDate"].unique():
        if isinstance(outside_date, str) and "9999" in outside_date:
            replace_outside_dates[outside_date] = ""
    auths.dropna(subset=["MemberId"], inplace=True)
    auths["MemberId"] = auths["MemberId"].astype(int)
    auths["ApprovalExpirationDate"].replace(replace_outside_dates, inplace=True)
    auths["ApprovalExpirationDate"] = pd.to_datetime(
        auths["ApprovalExpirationDate"])

    # authorization type codes -> human-readable descriptions
    auth_types = {
        "DM": "DME",
        "DY": "Dialysis",
        "ER": "Emergency Room",
        "HH": "Home Care",
        "HO": "Hospice",
        "IN": "Inpatient",
        "OP": "Outpatient",
        "OT": "Other",
        "PR": "Personal Emergency Response",
        "TR": "Transportation",
    }
    # service type codes -> human-readable descriptions
    service_types = {
        "OR20": "Podiatry",
        "DE10": "Dental",
        "XR10": "Radiology (General)",
        "CR11": "Cardiology",
        "VS10": "Ophthalmology",
        "UR10": "Urology",
        "VS20": "Optometry",
        "OA10": "Other",
        "XR40": "Radiology Mammography",
        "GA10": "Gastroenterology",
        "GV10": "Office Visit",
        "SG10": "Surgeon",
        "NE10": "Nephrology",
        "PH80": "Hematology/Oncology",
        "OR10": "Orthopedics",
        "XR30": "Radiology - Cat Scan",
        "BH10": "Psychiatry",
        "VS50": "Vision Hardware",
        "XR50": "Radiology - MRI",
        "XR90": "Radiology - Ultrasound",
        "DM10": "DME - Purchase",
        "SK10": "Dermatology",
        "NE20": "Neurology",
        "OU20": "Radiology Tests",
        "PC20": "Primary Care",
        "DE50": "Dental Hardware",
        "BH30": "Psychology",
        "WO20": "Wounds Care",
        "EM10": "Emergency Dept Visit",
        "EM40": "Observation Management",
        "IN10": "Inpatient Hospital",
        "XR55": "Outpatient MRI/CT",
        "SG20": "Outpatient Surgery",
        "VS30": "Ophthalmology - Special Services",
        "BH11": "Inpatient Psychiatric Unit/Facility",
        "SG50": "Surgery Services",
        "PH70": "Phys. Svc. - Inpatient/Outpatient Medical Special",
        "PC10": "Phys. Svc. - Other",
        "XR60": "Radiology - Diagnostic Nuclear Medicine",
        "XR80": "Radiation Therapy",
        "BH20": "Health and Behavior Assessment",
        "IN20": "Inpatient Medical Specialist",
        "CA20": "Cancer Center",
        "PU20": "Phys. Svc. - Pulmonology",
        "LB10": "Pathology and Laboratory",
        "RD10": "Chemotherapy",
        "SP20": "In-Home Medical Supplies",
        "DM00": "Oxygen",
        "DM15": "DME - Rental",
        "SP30": "Medical Supplies General",
        "AU20": "Phys. Svc. - Audiology",
        "AU50": "Speech & Hearing Hardware",
        "DY10": "Dialysis",
        "SN10": "Phys. Svc. - SNF",
        "CR10": "Cardiovascular",
        "NU20": "Nurse Practioner",
        "HHRN": "Home Health RN Services",
        "HO10": "Hospice Services",
        "HO20": "Hospice - Hospital or LTC",
        "DM60": "Orthotic Procedures and Devices",
        "DM30": "Prosthetic Procedures and Devices",
        "AU10": "Otorhinolaryngologic Services",
        "HM40": "Housekeeping and Chore Services",
        "GY10": "Phys. Svc. - Gynecology",
        "RD20": "EKG",
        "PS10": "Purchase Service Other",
        "HM20": "In-Home - Personal Care/Home Chore",
        "HH53": "In-Home Supportive Care",
        "HM30": "Chore Services",
        "HH20": "Personal Care Assistant - PCA",
        "TR10": "Emergency Transportation",
        "HH50": "Home Health Aide - HHA",
        "AL10": "Assisted Living - Permanent Placement",
        "NH60": "Nursing Home - Skilled Nursing",
        "NH30": "Nursing Home - Permanent Placement",
        "NH50": "Nursing Home - Respite/Temporary",
        "NH20": "Nursing Home - Inpatient",
        "ME10": "Meals & Distribution",
        "HH51": "In-Home - Supportive Care",
        "TR50": "Non Emergent Transportation",
        "PC11": "Physician Fees - Other",
        "PT20": "Physical Therapy - Normal (Non-In Home)",
        "AD15": "Adult Day Center Attendance",
        "PR20": "Prescription Medications",
        "PR10": "Personal Alarm/Response System",
        "TR60": "Transportation - Other",
        "AL20": "Assisted Living - Respite/Temporary",
        "PT10": "Physical Therapy - In Home",
        "OT10": "Occupational Therap - In Home",
        "EM30": "Phys. Svc - Emergency Room",
        "ME20": "Meals - In Home",
        "TR70": "Transportation - Ambulette",
        "HH80": "Home Health Other",
        "SP40": "Medical Supplies General (House Stock)",
        "GV20": "Phys. Svc - Admin",
        "HH30": "Homemaker",
        "AN10": "Phys. Svc - Anesthesiology",
        "MS10": "Medical Services",
        "MH10": "Purchase Service MH/MR",
        "AL30": "Assisted Living - Observation",
        "ET10": "Enteral and Parenteral Therapy",
        "RBMS": "R&B Medical/Surgical",
        "PY10": "Psychotherapy",
        "EM20": "Emergency Room Out of Area",
    }
    auths.columns = clean_table_columns(auths.columns)
    auths["authorization_type"].replace(auth_types, inplace=True)
    auths["service_type"].replace(service_types, inplace=True)

    # merge provider first/last names into single columns
    auths["referring_provider"] = (
        auths["referring_provider_first_name"].fillna("") + " " +
        auths["referring_provider_last_name"].fillna(""))
    auths["vendor"] = (auths["referred_to_provider_first_name"].fillna("") +
                       " " +
                       auths["referred_to_provider_last_name"].fillna(""))
    auths["vendor"] = auths["vendor"].str.strip()
    combined_cols_to_drop = [
        "referring_provider_first_name",
        "referring_provider_last_name",
        "referred_to_provider_first_name",
        "referred_to_provider_last_name",
    ]
    auths.drop(combined_cols_to_drop, axis=1, inplace=True)
    auths.dropna(subset=["member_id"], inplace=True)
    # 1003 is the training member; exclude it
    auths = auths[auths.member_id != 1003]
    auths.reset_index(drop=True, inplace=True)
    auths.to_csv(f"{processed_data}\\auths.csv", index=False)
    # BUG FIX: the docstring promised a DataFrame but nothing was
    # returned; return the cleaned frame like the sibling processors do.
    return auths
def process_medications():
    """Clean and process the medications dataset.

    Meds are grouped by member and description; the minimum start date
    and maximum start date are found. The discontinue date is filled in
    with either the estimated discontinue date or the date of the most
    recent script. The result has a column indicating if a med is
    active, the create date of the script, the most recent script date,
    and the discontinue date.

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    meds = pd.read_csv(
        f"{raw_data}\\meds.csv",
        parse_dates=[
            "start_date",
            "discontinue_date",
            "estimated_end_date",
            "create_date",
        ],
    )
    helpers = Helpers(database_path)
    enroll = helpers.dataframe_query(
        q="SELECT member_id, disenrollment_date FROM enrollment", params=None)
    meds.columns = clean_table_columns(meds.columns)

    # earliest script per (member, med)
    df = (meds.groupby(["member_id", "desc"]).min().reset_index()[[
        "member_id", "desc", "name", "start_date", "create_date"
    ]])
    # latest dates per (member, med); max(start_date) is the most
    # recent script for that med
    df[["discontinue_date", "estimated_end_date", "most_recent_script"
        ]] = (meds.groupby(["member_id", "desc"]).max().reset_index()[[
            "discontinue_date", "estimated_end_date", "start_date"
        ]])

    # attach status/class info from the row of the most recent script
    status = meds[[
        "member_id", "start_date", "status", "class", "discontinue_type"
    ]].copy()
    status.rename(columns={"start_date": "date"}, inplace=True)
    dff = df.merge(
        status,
        left_on=["member_id", "most_recent_script"],
        right_on=["member_id", "date"],
        how="left",
    )
    dff.drop("date", axis=1, inplace=True)
    dff.drop_duplicates(inplace=True)
    dff.reset_index(drop=True, inplace=True)
    dff = dff.merge(enroll, on="member_id")

    # successive repairs of discontinue_date: prefer estimated end,
    # then most recent script, then disenrollment, then create date
    dff["discontinue_date"] = np.where(
        ((dff.discontinue_date < dff.most_recent_script) &
         (dff.status != "Active")),
        dff["estimated_end_date"],
        dff["discontinue_date"],
    )
    dff["discontinue_date"] = np.where(
        ((dff.discontinue_date < dff.most_recent_script) &
         (dff.status != "Active")),
        dff["most_recent_script"],
        dff["discontinue_date"],
    )
    # active meds must not carry a discontinue date
    dff["discontinue_date"] = np.where(
        ((dff.discontinue_date.notnull()) & (dff.status == "Active")),
        pd.NaT,
        dff["discontinue_date"],
    )
    # inactive meds must carry one
    dff["discontinue_date"] = np.where(
        ((dff.discontinue_date.isnull()) & (dff.status != "Active")),
        dff["most_recent_script"],
        dff["discontinue_date"],
    )
    dff["discontinue_date"] = np.where(
        ((dff.discontinue_date.isnull()) & (dff.status != "Active")),
        dff["disenrollment_date"],
        dff["discontinue_date"],
    )
    dff["discontinue_date"] = np.where(
        ((dff.discontinue_date.isnull()) & (dff.status != "Active")
         & dff.start_date.isnull()),
        dff["create_date"],
        dff["discontinue_date"],
    )
    dff["start_date"] = np.where(dff.start_date.isnull(),
                                 dff["create_date"], dff["start_date"])
    dff.drop(["disenrollment_date", "create_date"], axis=1, inplace=True)
    dff = code_y_n(dff)
    # BUG FIX: this dropna previously ran on the stale intermediate
    # `df`, leaving all-NA columns in the output; it belongs on `dff`
    # (matching the pattern used in process_incidents).
    dff.dropna(axis=1, how="all", inplace=True)
    dff["start_date"] = pd.to_datetime(dff["start_date"])
    dff["discontinue_date"] = pd.to_datetime(dff["discontinue_date"])
    dff.reset_index(inplace=True, drop=True)

    # 1003 is the training member; exclude it
    final_df = dff[dff.member_id != 1003].copy()
    # invariants: active <=> no discontinue date; start date always set
    assert (final_df[(final_df["status"] == "Active")
                     & (final_df["discontinue_date"].notnull())].shape[0] == 0)
    assert (final_df[(final_df["status"] != "Active")
                     & (final_df["discontinue_date"].isnull())].shape[0] == 0)
    assert final_df[(final_df["start_date"].isnull())].shape[0] == 0
    final_df = final_df.sort_values("discontinue_date")
    final_df.drop_duplicates(subset=["member_id", "desc"], inplace=True)
    final_df.to_csv(f"{processed_data}\\meds.csv", index=False)
    # BUG FIX: previously returned the intermediate `df`; return the
    # validated frame that was actually written to disk.
    return final_df
def process_alfs():
    """Clean and process the assisted living facility dataset.

    Filtered for assisted living admissions; column names are cleaned,
    indicated columns are dropped, a member_id column is created from
    the participant name, discharge type is split into type/facility
    columns, and the training member is dropped.

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    alfs = pd.read_csv(f"{raw_data}\\alfs.csv",
                       parse_dates=["AdmissionDate", "DischargeDate"])
    # keep only assisted living admissions
    alfs = alfs[(alfs["FacilityType"] == "Assisted Living - Permanent") |
                (alfs["FacilityType"] == "Assisted Living - Respite")].copy()
    alfs.columns = clean_table_columns(alfs.columns)
    drop_cols = ["observation_only", "participant_name"]
    alfs["member_id"] = alfs["participant_name"].apply(get_id)
    # split discharge_type only when there are rows and at least one
    # non-null discharge_type (str.split on an all-NA column would not
    # expand into the expected number of columns)
    if (alfs.shape[0] != 0) & (alfs["discharge_type"].isnull().sum() !=
                               alfs["discharge_type"].shape[0]):
        try:
            alfs[[
                "discharge_type", "discharge_facility", "discharge_facility2"
            ]] = alfs["discharge_type"].str.split(" - ", expand=True)
        except ValueError:
            # only two parts present in this extract
            alfs[["discharge_type", "discharge_facility"
                  ]] = alfs["discharge_type"].str.split(" - ", expand=True)
            alfs["discharge_facility2"] = np.nan
        # "Respite"/"Permanent"/"Skilled" belong to the type, not the
        # facility; shift the pieces over when that happens
        alfs["discharge_type"] = np.where(
            alfs["discharge_facility"].isin(
                ["Respite", "Permanent", "Skilled"]),
            alfs["discharge_type"] + " " + alfs["discharge_facility"],
            alfs["discharge_type"],
        )
        alfs["discharge_facility"] = np.where(
            alfs["discharge_facility"].isin(
                ["Respite", "Permanent", "Skilled"]),
            alfs["discharge_facility2"],
            alfs["discharge_facility"],
        )
        alfs.drop(["discharge_facility2"], axis=1, inplace=True)
    alfs.drop(drop_cols, axis=1, inplace=True)
    # 1003 is the training member; exclude it
    alfs = alfs[alfs.member_id != 1003]
    alfs.to_csv(f"{processed_data}\\alfs.csv", index=False)
    # BUG FIX: the docstring promised a DataFrame but nothing was
    # returned; return the cleaned frame like the sibling processors do.
    return alfs
def process_enrollment():
    """ Cleans/Processes dataset
    Indicated columns are dropped
    Column names are cleaned
    Name column is split into last and first name columns
    Medicare and Medicaid numbers are replaced with 1s
    empty IDs are replaced with 0
    Disenrolled reason is split into type and reason and cleaned
    Training member is dropped
    Ppt data (first, last) are merged with team data
    and saved apart from enrollment file

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data files
    """
    enrollment = pd.read_csv(
        f"{raw_data}\\enrollment.csv",
        parse_dates=["EnrollmentDate", "DisenrollmentDate"],
    )
    cols_to_drop = ["ParticipantName", "Gender", "SSN", "DeathDate"]
    # create first and last name cols — must happen BEFORE the drop
    # below, since ParticipantName is in cols_to_drop
    enrollment[["last", "first"]] = enrollment["ParticipantName"].str.split(
        ",", expand=True
    )
    # drop information that is in other tables/cols
    # or not needed (SSN)
    enrollment.drop(cols_to_drop, axis=1, inplace=True)
    enrollment.columns = clean_table_columns(enrollment.columns)
    enrollment.rename(columns={"other": "disenroll_reason"}, inplace=True)
    # code medicare/medicaid as 1 for has 0 for does not
    # (the raw columns hold ID numbers; only presence matters)
    enrollment["medicare"] = np.where(enrollment["medicare"].notnull(), 1, 0)
    enrollment["medicaid"] = np.where(enrollment["medicaid"].notnull(), 1, 0)
    # disenroll_reasons begins with the type (voluntary/non)
    # Split that info out in to a new column
    # NOTE(review): the trailing .replace("", "") is a no-op — confirm
    # whether a real replacement value was intended
    enrollment["disenroll_type"] = (
        enrollment["disenroll_reason"]
        .astype(str)
        .str.split(" ", expand=True)[0]
        .replace("", "")
    )
    # everything after the first word is the reason text
    enrollment["disenroll_reason"] = enrollment["disenroll_reason"].apply(
        lambda x: " ".join(str(x).split(" ")[1:])
    )
    # dissatisfied with is implied in all of these reasons
    enrollment["disenroll_reason"] = (
        enrollment["disenroll_reason"].astype(str).str.replace("Dissatisfied with ", "")
    )
    enrollment["disenroll_type"] = enrollment["disenroll_type"].astype(str).str.title()
    enrollment["disenroll_reason"] = (
        enrollment["disenroll_reason"].astype(str).str.title()
    )
    # guard against duplicate column names after cleaning
    assert len(set(enrollment.columns)) == len(enrollment.columns)
    # 1003 is the training member; exclude it
    enrollment = enrollment[enrollment.member_id != 1003]
    # participant name lookup table, saved separately below
    ppts = enrollment[["member_id", "last", "first"]].copy()
    # per-member center membership windows
    centers = enrollment[
        ["member_id", "center", "enrollment_date", "disenrollment_date"]
    ].copy()
    centers.rename(
        columns={"enrollment_date": "start_date", "disenrollment_date": "end_date"},
        inplace=True,
    )
    # fold in center transfers when the transfers extract is usable;
    # NOTE(review): the except catches ValueError (e.g. parse_dates on a
    # missing column) — a missing transfers.csv would still raise
    # FileNotFoundError; confirm that is the intended behavior
    try:
        transfers = pd.read_csv(
            f"{raw_data}\\transfers.csv", parse_dates=["TransferDate"]
        )
        transfers.columns = clean_table_columns(transfers.columns)
        transfers.rename(columns={"transfer_date": "start_date"}, inplace=True)
        transfers["end_date"] = np.nan
        # the comment field embeds the previous center as its 5th word
        transfers["old_center"] = transfers["comment"].str.split().str[4]
        transfers.drop(
            ["text_box5", "pariticipant_name", "comment"], axis=1, inplace=True
        )
        centers = centers.append(transfers, sort=True)
    except ValueError:
        centers["old_center"] = None
    # fix the column order for the output file
    centers = centers[["member_id", "center", "start_date", "end_date", "old_center"]]
    ppts.drop_duplicates(subset=["member_id"], inplace=True)
    # census file keeps the name columns; enrollment.csv does not
    enrollment.to_csv(f"{processed_data}\\enrollment_for_census.csv", index=False)
    enrollment.drop(["last", "first"], axis=1, inplace=True)
    centers.to_csv(f"{processed_data}\\centers.csv", index=False)
    enrollment.to_csv(f"{processed_data}\\enrollment.csv", index=False)
    ppts.to_csv(f"{processed_data}\\ppts.csv", index=False)
    return enrollment
def process_incidents(df, cols_to_drop, incident_name, break_location=False):
    """Clean and process an incident dataset.

    Indicated columns are dropped, column names are cleaned, all-NA
    columns are removed, Yes/No columns are coded 1/0, and the training
    member is dropped. When `break_location` is set, the location column
    (format "location - location detail") is split into a location and a
    location details column and common replacements are applied. An
    incident_id column is created from member and occurrence time.

    Args:
        df(DataFrame): pandas dataframe to clean
        cols_to_drop(list): list of columns to drop
        incident_name(str): name of incident to save cleaned file as
        break_location(bool): indicates if the dataframe has a location
            column that can be broken up in the format
            "location-location detail".

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    # in-place on purpose: callers may rely on the passed frame being
    # trimmed as well
    df.drop(cols_to_drop, axis=1, inplace=True)
    df.columns = clean_table_columns(df.columns)
    df = code_y_n(df)
    df.dropna(axis=1, how="all", inplace=True)
    df["date_time_occurred"] = pd.to_datetime(df["date_time_occurred"])
    # not every incident extract carries a discovery date
    try:
        df["date_discovered"] = pd.to_datetime(df["date_discovered"])
    except KeyError:
        pass
    if break_location:
        # split once and reuse both pieces
        loc_parts = df["location"].str.split(" - ", expand=True)
        df["location_details"] = loc_parts[1]
        df["location"] = loc_parts[0]
        replacements = {
            "Participant": "",
            "PACE Center": "PACE",
            "Nursing Facility": "NF",
            "Assisted Living Facility": "ALF",
        }
        df["location"].replace(replacements, inplace=True)
    df.reset_index(inplace=True, drop=True)
    df["incident_id"] = create_id_col(df, ["member_id", "date_time_occurred"],
                                      "incident_id")
    # guard against duplicate column names after cleaning
    assert len(set(df.columns)) == len(df.columns)
    # 1003 is the training member; exclude it
    df = df[df.member_id != 1003]
    df.to_csv(f"{processed_data}\\{incident_name}.csv", index=False)
    return df
def process_grievances(update=True):
    """Clean and process the grievances grid.  ### NOT USED CURRENTLY

    Works with Pauline's grid: the first data row marks where the
    provider/type indicator columns start and end, the second row holds
    the real column names. Indicator (one-hot) provider/type columns are
    collapsed into single `providers`/`types` columns, dates are parsed,
    and a griev_id is created.

    Args:
        update(bool): when False, the previous grievances extract is
            appended to the current one before processing.

    Returns:
        DataFrame: cleaned dataframe

    Outputs:
        csv: processed data file
    """
    grievances = pd.read_csv(f"{raw_data}\\grievances.csv")
    if not update:
        prev_griev = pd.read_csv(f"{raw_data}\\grievances_prev.csv")
        # BUG FIX: DataFrame.append returns a new frame; the result was
        # previously discarded, so the previous extract was never
        # actually included. NOTE(review): this assumes the previous
        # extract has the same two header rows (dropped below via the
        # duplicated 0/1 index labels) — confirm against the raw file.
        grievances = grievances.append(prev_griev, sort=False)
    # works with Pauline's grid
    # looks for where she indicates the providers/types start
    # this is in the current first row
    provider_start = grievances.columns.tolist().index("Provider")
    provider_end_type_start = grievances.columns.tolist().index("TYPE")
    type_end = (grievances.iloc[0].values.tolist().index(
        "full resolution for internal tracking"))
    # actual col names are in the second row (NaN cells are skipped)
    grievances.columns = [
        col for col in grievances.iloc[1].values.tolist()
        if not isinstance(col, float)
    ]
    grievances.drop([0, 1], inplace=True)
    grievances.reset_index(drop=True, inplace=True)
    # fix one column that needs the long title for others that use the grid
    grievances.rename(
        columns={
            "grievance # (if highlighted, indicates letter was sent)":
            "grievance_num"
        },
        inplace=True,
    )
    # get cols that indicate if a grievance is attributed to
    # a specific provider
    providers = grievances.columns[
        provider_start:provider_end_type_start].tolist()
    # or a specific type
    types = grievances.columns[provider_end_type_start:type_end].tolist()
    grievances.columns = [
        col.lower() if col in providers or col in types else col
        for col in grievances.columns
    ]
    grievances.dropna(axis=1, how="all", inplace=True)
    grievances.columns = clean_table_columns(grievances.columns)
    grievances.rename(columns={"participant_id": "member_id"}, inplace=True)
    # keep only the numeric part of "123-..." grievance numbers
    grievances["grievance_num"] = grievances["grievance_num"].str.split(
        "-", expand=True)[0]
    # recompute after lower-casing/cleaning renamed the columns
    providers = grievances.columns[
        provider_start:provider_end_type_start].tolist()
    types = grievances.columns[provider_end_type_start:type_end].tolist()
    # collapse the one-hot provider/type grids into single name columns
    grievances = create_indicator_col(grievances, "providers", providers)
    grievances = create_indicator_col(grievances, "types", types)
    grievances["category_of_the_grievance"] = np.where(
        grievances["category_of_the_grievance"].str.contains("Contracted"),
        "Contracted Facility",
        grievances["category_of_the_grievance"],
    )
    grievances["description_of_the_grievance"] = np.where(
        grievances["description_of_the_grievance"].str.contains("Other"),
        "Other",
        grievances["description_of_the_grievance"],
    )
    # turn quality analysis col to binary 1/0 for Y/N
    grievances["quality_analysis"].replace(["Y", "N"], [1, 0], inplace=True)
    # create datetime cols
    grievances["date_grievance_received"] = pd.to_datetime(
        grievances["date_grievance_received"])
    grievances["date_of_resolution"] = pd.to_datetime(
        grievances["date_of_resolution"])
    grievances["date_of_oral_notification"] = pd.to_datetime(
        grievances["date_of_oral_notification"])
    grievances["date_of_written_notification"] = pd.to_datetime(
        grievances["date_of_written_notification"])
    # drop cols that are not needed, includes all the provider
    # and types cols that have been essentially "un" one hot encoded
    col_to_drop = ([
        "participant_first_name",
        "participant_last_name",
        "year_and_qtr_received",
        "quarter_reported",
        "notes",
    ] + providers + types)
    grievances.drop(col_to_drop, axis=1, inplace=True)
    grievances.dropna(subset=["member_id", "date_grievance_received"],
                      inplace=True)
    grievances["griev_id"] = create_id_col(
        grievances, ["member_id", "date_grievance_received"], "griev_id")
    grievances.to_csv(f"{processed_data}\\grievances.csv", index=False)
    return grievances