Exemple #1
0
def extract_patient_sample(raw_sample_df: DataFrame) -> DataFrame:
    """Project the raw sample frame onto the patient-sample columns.

    Two columns are renamed (``sample_id`` -> ``external_patient_sample_id``
    and ``model_id`` -> ``model_name``) and ``prior_treatment`` is passed
    through ``init_cap_and_trim_all`` while keeping its name. Every other
    selected column is carried over unchanged.

    :param raw_sample_df: frame holding the raw sample columns listed below.
    :return: a new frame with exactly the selected columns, in this order.
    """
    external_sample_id = col("sample_id").alias("external_patient_sample_id")
    # NOTE(review): helper is defined elsewhere; presumably it normalises
    # capitalisation and surrounding whitespace -- confirm against its source.
    prior_treatment = init_cap_and_trim_all("prior_treatment").alias(
        "prior_treatment")
    model_name = col("model_id").alias("model_name")

    selected_columns = [
        "diagnosis",
        external_sample_id,
        "grade",
        "grading_system",
        "stage",
        "staging_system",
        "primary_site",
        "collection_site",
        prior_treatment,
        "tumour_type",
        model_name,
    ]
    return raw_sample_df.select(*selected_columns)
Exemple #2
0
def clean_data_before_join(patient_df: DataFrame) -> DataFrame:
    """Replace ``ethnicity`` with its ``init_cap_and_trim_all`` form.

    :param patient_df: patient frame containing an ``ethnicity`` column.
    :return: a new frame in which ``ethnicity`` holds the helper's output;
        all other columns are untouched.
    """
    # NOTE(review): helper is defined elsewhere; presumably it normalises
    # capitalisation and whitespace -- confirm against its source.
    normalised_ethnicity = init_cap_and_trim_all("ethnicity")
    return patient_df.withColumn("ethnicity", normalised_ethnicity)
def get_engraftment_site_from_model(raw_model_df: DataFrame) -> DataFrame:
    """Return a single-column frame ``name`` built from ``engraftment_site``.

    The value is the output of ``init_cap_and_trim_all`` applied to the
    ``engraftment_site`` column. No null filtering or de-duplication is done.

    :param raw_model_df: raw model frame with an ``engraftment_site`` column.
    :return: frame with one column, ``name``.
    """
    site_name = init_cap_and_trim_all("engraftment_site").alias("name")
    return raw_model_df.select(site_name)
def get_tumour_type_from_sample(raw_sample_df: DataFrame) -> DataFrame:
    """Return a single-column frame ``name`` built from ``tumour_type``.

    The value is the output of ``init_cap_and_trim_all`` applied to the
    ``tumour_type`` column. No null filtering or de-duplication is done.

    :param raw_sample_df: raw sample frame with a ``tumour_type`` column.
    :return: frame with one column, ``name``.
    """
    tumour_type_name = init_cap_and_trim_all("tumour_type").alias("name")
    return raw_sample_df.select(tumour_type_name)
def get_engraftment_material_from_model(raw_model_df: DataFrame) -> DataFrame:
    """Return a single-column frame ``name`` built from ``sample_type``.

    The engraftment material is read from the model frame's ``sample_type``
    column and passed through ``init_cap_and_trim_all``. No null filtering or
    de-duplication is done.

    :param raw_model_df: raw model frame with a ``sample_type`` column.
    :return: frame with one column, ``name``.
    """
    material_name = init_cap_and_trim_all("sample_type").alias("name")
    return raw_model_df.select(material_name)
Exemple #6
0
def clean_data_before_join(patient_sample_df: DataFrame) -> DataFrame:
    """Replace ``tumour_type`` with its ``init_cap_and_trim_all`` form.

    :param patient_sample_df: patient-sample frame containing a
        ``tumour_type`` column.
    :return: a new frame in which ``tumour_type`` holds the helper's output;
        all other columns are untouched.
    """
    # NOTE(review): helper is defined elsewhere; presumably it normalises
    # capitalisation and whitespace -- confirm against its source.
    normalised_tumour_type = init_cap_and_trim_all("tumour_type")
    return patient_sample_df.withColumn("tumour_type", normalised_tumour_type)
Exemple #7
0
def get_project_group_from_sharing(raw_sharing_df: DataFrame) -> DataFrame:
    """Return the non-null ``project`` values of the sharing frame as ``name``.

    Rows whose raw ``project`` is null are dropped, then the remaining values
    are passed through ``init_cap_and_trim_all`` and exposed under the single
    output column ``name``. No de-duplication is done.

    The null check is applied *before* the projection so the predicate refers
    to a column that exists in the frame being filtered, instead of relying
    on Spark re-resolving a column that the ``select`` has already dropped.

    :param raw_sharing_df: raw sharing frame with a ``project`` column.
    :return: frame with one column, ``name``.
    """
    sharing_with_project_df = raw_sharing_df.where("project is not null")
    return sharing_with_project_df.select(
        init_cap_and_trim_all("project").alias("name"))
Exemple #8
0
def get_ethnicity_from_patient(raw_patient_df: DataFrame) -> DataFrame:
    """Return the distinct, non-null ethnicity names found in the patient frame.

    ``ethnicity`` is passed through ``init_cap_and_trim_all`` and exposed as
    ``name``; rows where the resulting ``name`` is null are removed and
    duplicates are dropped.

    :param raw_patient_df: raw patient frame with an ``ethnicity`` column.
    :return: frame with one column, ``name``, holding unique non-null values.
    """
    ethnicity_name = init_cap_and_trim_all("ethnicity").alias("name")
    return (
        raw_patient_df
        .select(ethnicity_name)
        # The projection already yields only ``name``, so the filter and
        # de-duplication act on that single column.
        .where("name is not null")
        .drop_duplicates()
    )
Exemple #9
0
def extract_model_validation(raw_model_validation_df: DataFrame) -> DataFrame:
    """Replace ``validation_technique`` with its ``init_cap_and_trim_all`` form.

    :param raw_model_validation_df: raw model-validation frame containing a
        ``validation_technique`` column.
    :return: a new frame in which ``validation_technique`` holds the helper's
        output; all other columns are untouched.
    """
    # NOTE(review): helper is defined elsewhere; presumably it normalises
    # capitalisation and whitespace -- confirm against its source.
    normalised_technique = init_cap_and_trim_all("validation_technique")
    return raw_model_validation_df.withColumn(
        "validation_technique", normalised_technique)