def transform_quality_assurance(
        raw_model_validation_df: DataFrame,
        model_df: DataFrame) -> DataFrame:
    """Build the quality assurance DataFrame.

    Applies, in order: ``extract_model_validation`` on the raw input,
    ``set_fk_model`` against ``model_df``, ``add_id(..., "id")`` and
    ``get_columns_expected_order``.

    :param raw_model_validation_df: raw model validation data.
    :param model_df: DataFrame handed to ``set_fk_model``.
    :return: the result of the last step.
    """
    validation = extract_model_validation(raw_model_validation_df)
    validation_with_model = set_fk_model(validation, model_df)
    validation_with_id = add_id(validation_with_model, "id")
    return get_columns_expected_order(validation_with_id)
def transform_tissue(raw_sample_df: DataFrame) -> DataFrame:
    """Build the tissue DataFrame from the raw sample data.

    Unions the outputs of ``get_collection_site_from_sample`` and
    ``get_primary_type_from_sample``, drops duplicate rows, calls
    ``add_id(..., "id")`` and keeps only the ``id`` and ``name`` columns.
    """
    collection_sites = get_collection_site_from_sample(raw_sample_df)
    primary_types = get_primary_type_from_sample(raw_sample_df)
    unique_tissues = collection_sites.union(primary_types).drop_duplicates()
    return add_id(unique_tissues, "id").select("id", "name")
def transform_diagnosis(
        raw_patient_df: DataFrame,
        raw_sample_df: DataFrame) -> DataFrame:
    """Build the diagnosis DataFrame from raw patient and sample data.

    Unions the outputs of ``get_diagnosis_from_patient`` and
    ``get_diagnosis_from_sample``, drops duplicate rows, calls
    ``add_id(..., "id")`` and keeps only the ``id`` and ``name`` columns.
    """
    from_patients = get_diagnosis_from_patient(raw_patient_df)
    from_samples = get_diagnosis_from_sample(raw_sample_df)
    unique_diagnoses = from_patients.union(from_samples).drop_duplicates()
    return add_id(unique_diagnoses, "id").select("id", "name")
def transform_engraftment_sample_type(raw_model_df: DataFrame) -> DataFrame:
    """Build the engraftment sample type DataFrame from the raw model data.

    Takes the output of ``get_engraftment_sample_type_from_model``, drops
    duplicate rows, calls ``add_id(..., "id")`` and keeps only the ``id``
    and ``name`` columns.
    """
    unique_sample_types = get_engraftment_sample_type_from_model(
        raw_model_df).drop_duplicates()
    return add_id(unique_sample_types, "id").select("id", "name")
def transform_treatment(
        drug_dosing_df: DataFrame,
        patient_treatment_df: DataFrame) -> DataFrame:
    """Build the treatment DataFrame from drug dosing and patient treatment data.

    Unions the outputs of ``get_treatment_from_drug_dosing`` and
    ``get_treatment_patient_treatment``, rewrites the ``name`` column with
    ``lower_and_trim_all("name")``, drops duplicate rows, calls
    ``add_id(..., "id")`` and keeps only the ``id`` and ``name`` columns.

    :param drug_dosing_df: DataFrame passed to ``get_treatment_from_drug_dosing``.
    :param patient_treatment_df: DataFrame passed to
        ``get_treatment_patient_treatment``.
    :return: DataFrame with the columns ``id`` and ``name``.
    """
    treatment_df = get_treatment_from_drug_dosing(drug_dosing_df).union(
        get_treatment_patient_treatment(patient_treatment_df))
    # Normalise names before de-duplicating so the duplicate check runs on
    # the normalised values.
    treatment_df = treatment_df.withColumn("name", lower_and_trim_all("name"))
    treatment_df = treatment_df.drop_duplicates()
    treatment_df = add_id(treatment_df, "id")
    # The former debugging call to ``show()`` was removed: it printed rows to
    # stdout and triggered a Spark job every time the transformation ran.
    return treatment_df.select("id", "name")
def transform_patient_snapshot(
        raw_sample_df: DataFrame,
        patient_sample_df: DataFrame,
        patient_df: DataFrame) -> DataFrame:
    """Build the patient snapshot DataFrame.

    Applies, in order: ``clean_data_before_join`` on the raw sample data,
    ``set_fk_patient`` against ``patient_df``, ``set_fk_patient_sample``
    against ``patient_sample_df``, ``add_id(..., "id")`` and
    ``get_columns_expected_order``.
    """
    snapshot = clean_data_before_join(raw_sample_df)
    snapshot = set_fk_patient_sample(
        set_fk_patient(snapshot, patient_df), patient_sample_df)
    return get_columns_expected_order(add_id(snapshot, "id"))
def transform_patient(
        raw_patient_df: DataFrame,
        diagnosis_df: DataFrame,
        ethnicity_df: DataFrame,
        provider_group_df: DataFrame) -> DataFrame:
    """Build the patient DataFrame.

    Applies, in order: ``clean_data_before_join`` on the raw patient data,
    ``set_fk_diagnosis``, ``set_fk_ethnicity`` and ``set_fk_provider_group``
    (each with its matching reference DataFrame), ``set_external_id``,
    ``add_id(..., "id")`` and ``get_columns_expected_order``.
    """
    patients = clean_data_before_join(raw_patient_df)

    # Each step pairs a foreign-key helper with the frame it is given.
    fk_steps = (
        (set_fk_diagnosis, diagnosis_df),
        (set_fk_ethnicity, ethnicity_df),
        (set_fk_provider_group, provider_group_df),
    )
    for apply_fk, reference_df in fk_steps:
        patients = apply_fk(patients, reference_df)

    patients = add_id(set_external_id(patients), "id")
    return get_columns_expected_order(patients)
def transform_patient_sample(
        raw_sample_df: DataFrame,
        diagnosis_df: DataFrame,
        tissue_df: DataFrame,
        tumour_type_df: DataFrame,
        model_df: DataFrame,
        raw_sample_platform_df: DataFrame) -> DataFrame:
    """Build the patient sample DataFrame.

    Applies ``extract_patient_sample``, ``clean_data_before_join`` and
    ``add_id(..., "id")`` to the raw sample data, then runs the helpers
    ``set_fk_diagnosis``, ``set_fk_origin_tissue``, ``set_fk_sample_site``,
    ``set_fk_tumour_type``, ``set_fk_model`` and ``set_raw_data_url`` in that
    order, and finally ``get_columns_expected_order``.

    ``tissue_df`` is passed to both ``set_fk_origin_tissue`` and
    ``set_fk_sample_site``.
    """
    samples = extract_patient_sample(raw_sample_df)
    samples = clean_data_before_join(samples)
    # The id is added before any of the two-argument helpers run.
    samples = add_id(samples, "id")

    enrichment_steps = (
        (set_fk_diagnosis, diagnosis_df),
        (set_fk_origin_tissue, tissue_df),
        (set_fk_sample_site, tissue_df),
        (set_fk_tumour_type, tumour_type_df),
        (set_fk_model, model_df),
        (set_raw_data_url, raw_sample_platform_df),
    )
    for apply_step, reference_df in enrichment_steps:
        samples = apply_step(samples, reference_df)

    return get_columns_expected_order(samples)
# NOTE(review): SOURCE contains a second, one-argument function that is also
# named transform_provider_group. If both live in the same module, the later
# definition replaces this one at import time -- confirm.
def transform_provider_group(
        raw_sharing_df: DataFrame,
        raw_loader_df: DataFrame,
        provider_type_df: DataFrame) -> DataFrame:
    """Build the provider group DataFrame.

    Combines the outputs of ``extract_data_sharing`` and
    ``extract_data_loader`` with ``join_sharing_loader``, then applies
    ``set_fk_provider_type`` against ``provider_type_df``,
    ``add_id(..., "id")`` and ``get_columns_expected_order``.
    """
    sharing_data = extract_data_sharing(raw_sharing_df)
    loader_data = extract_data_loader(raw_loader_df)
    provider_groups = join_sharing_loader(sharing_data, loader_data)
    provider_groups = set_fk_provider_type(provider_groups, provider_type_df)
    return get_columns_expected_order(add_id(provider_groups, "id"))
def transform_model(
        raw_model_df: DataFrame,
        raw_sharing_df: DataFrame,
        publication_group_df: DataFrame,
        accessibility_group_df: DataFrame,
        contact_people_df: DataFrame,
        contact_form_df: DataFrame,
        source_database_df: DataFrame) -> DataFrame:
    """Build the model DataFrame.

    Applies ``get_data_from_model`` to the raw model data,
    ``join_model_with_sharing`` against ``raw_sharing_df`` and
    ``add_id(..., "id")``; then runs ``set_fk_publication_group``,
    ``set_fk_accessibility_group``, ``set_fk_contact_people``,
    ``set_fk_contact_form`` and ``set_fk_source_database`` in that order,
    each with its matching reference DataFrame, and finally
    ``get_columns_expected_order``.
    """
    models = join_model_with_sharing(
        get_data_from_model(raw_model_df), raw_sharing_df)
    # The id is added before any of the set_fk_* helpers run.
    models = add_id(models, "id")

    fk_steps = (
        (set_fk_publication_group, publication_group_df),
        (set_fk_accessibility_group, accessibility_group_df),
        (set_fk_contact_people, contact_people_df),
        (set_fk_contact_form, contact_form_df),
        (set_fk_source_database, source_database_df),
    )
    for apply_fk, reference_df in fk_steps:
        models = apply_fk(models, reference_df)

    return get_columns_expected_order(models)
def transform_source_database(raw_sharing_df: DataFrame) -> DataFrame:
    """Build the source database DataFrame from the raw sharing data.

    Applies ``extract_source_database``, ``add_id(..., "id")`` and
    ``get_columns_expected_order``, in that order.
    """
    source_databases = extract_source_database(raw_sharing_df)
    return get_columns_expected_order(add_id(source_databases, "id"))
def transform_engraftment_site(raw_model_df: DataFrame) -> DataFrame:
    """Build the engraftment site DataFrame from the raw model data.

    Takes the output of ``get_engraftment_site_from_model``, drops duplicate
    rows, calls ``add_id(..., "id")`` and keeps only the ``id`` and ``name``
    columns.
    """
    unique_sites = get_engraftment_site_from_model(
        raw_model_df).drop_duplicates()
    return add_id(unique_sites, "id").select("id", "name")
def transform_tumour_type(raw_sample_df: DataFrame) -> DataFrame:
    """Build the tumour type DataFrame from the raw sample data.

    Takes the output of ``get_tumour_type_from_sample``, drops duplicate
    rows, calls ``add_id(..., "id")`` and keeps only the ``id`` and ``name``
    columns.
    """
    unique_tumour_types = get_tumour_type_from_sample(
        raw_sample_df).drop_duplicates()
    return add_id(unique_tumour_types, "id").select("id", "name")
def transform_engraftment_material(raw_model_df: DataFrame) -> DataFrame:
    """Build the engraftment material DataFrame from the raw model data.

    Takes the output of ``get_engraftment_material_from_model``, drops
    duplicate rows, calls ``add_id(..., "id")`` and keeps only the ``id``
    and ``name`` columns.
    """
    unique_materials = get_engraftment_material_from_model(
        raw_model_df).drop_duplicates()
    return add_id(unique_materials, "id").select("id", "name")
# NOTE(review): this function builds its result with
# get_provider_type_from_sharing, yet it is named transform_provider_group --
# a name SOURCE also uses for a three-argument function. It looks like it was
# meant to be called transform_provider_type; confirm against the callers
# before renaming, since a rename changes the public interface.
def transform_provider_group(raw_sharing_df: DataFrame) -> DataFrame:
    """Build a DataFrame from the provider type data in the raw sharing data.

    Applies ``get_provider_type_from_sharing``, ``add_id(..., "id")`` and
    ``get_columns_expected_order``, in that order.
    """
    provider_types = get_provider_type_from_sharing(raw_sharing_df)
    return get_columns_expected_order(add_id(provider_types, "id"))
def transform_xenograft_sample(raw_sample_platform_df: DataFrame) -> DataFrame:
    """Build the xenograft sample DataFrame from the raw sample platform data.

    Applies ``get_xenograft_sample_from_sample_platform``,
    ``add_id(..., "id")`` and ``get_columns_expected_order``, in that order.
    """
    xenograft_samples = get_xenograft_sample_from_sample_platform(
        raw_sample_platform_df)
    return get_columns_expected_order(add_id(xenograft_samples, "id"))
def transform_host_strain(raw_model_df: DataFrame) -> DataFrame:
    """Build the host strain DataFrame from the raw model data.

    Applies ``extract_host_strain``, ``add_id(..., "id")`` and
    ``get_columns_expected_order``, in that order.
    """
    host_strains = extract_host_strain(raw_model_df)
    return get_columns_expected_order(add_id(host_strains, "id"))
def transform_accessibility_group(raw_sharing_df: DataFrame) -> DataFrame:
    """Build the accessibility group DataFrame from the raw sharing data.

    Applies ``get_accessibility_group_from_sharing``, ``add_id(..., "id")``
    and ``get_columns_expected_order``, in that order.
    """
    accessibility_groups = get_accessibility_group_from_sharing(raw_sharing_df)
    return get_columns_expected_order(add_id(accessibility_groups, "id"))
def transform_ethnicity(raw_patient_df: DataFrame) -> DataFrame:
    """Build the ethnicity DataFrame from the raw patient data.

    Applies ``get_ethnicity_from_patient``, ``add_id(..., "id")`` and
    ``get_columns_expected_order``, in that order.
    """
    ethnicities = get_ethnicity_from_patient(raw_patient_df)
    return get_columns_expected_order(add_id(ethnicities, "id"))
def transform_publication_group(raw_model_df: DataFrame) -> DataFrame:
    """Build the publication group DataFrame from the raw model data.

    Applies ``extract_publications_from_models``, ``add_id(..., "id")`` and
    ``get_columns_expected_order``, in that order.
    """
    publication_groups = extract_publications_from_models(raw_model_df)
    return get_columns_expected_order(add_id(publication_groups, "id"))
def transform_project_group(raw_sharing_df: DataFrame) -> DataFrame:
    """Build the project group DataFrame from the raw sharing data.

    Takes the output of ``get_project_group_from_sharing``, drops duplicate
    rows, calls ``add_id(..., "id")`` and keeps only the ``id`` and ``name``
    columns.
    """
    unique_project_groups = get_project_group_from_sharing(
        raw_sharing_df).drop_duplicates()
    return add_id(unique_project_groups, "id").select("id", "name")
def transform_contact_form(raw_sharing_df: DataFrame) -> DataFrame:
    """Build the contact form DataFrame from the raw sharing data.

    Applies ``extract_contact_form``, ``add_id(..., "id")`` and
    ``get_columns_expected_order``, in that order.
    """
    contact_forms = extract_contact_form(raw_sharing_df)
    return get_columns_expected_order(add_id(contact_forms, "id"))