"""Build the squeezed-dimensions lookup table and upload it to S3."""
from dash_website.utils.aws_loader import load_csv, upload_file

if __name__ == "__main__":
    # Best-model R² per (dimension, subdimension). "ImmuneSystem" is renamed
    # "BloodCells" to match the naming used elsewhere on the website.
    # Fix: dropped the `f` prefix from a placeholder-free f-string (ruff F541).
    squeezed_dimensions = (
        load_csv(
            "page2_predictions/Performances/PERFORMANCES_bestmodels_alphabetical_instances_Age_test.csv"
        )[["organ", "view", "R-Squared_all", "R-Squared_sd_all"]]
        .rename(
            columns={
                "organ": "dimension",
                "view": "subdimension",
                "R-Squared_all": "r2",
                "R-Squared_sd_all": "r2_std",
            }
        )
        .replace({"ImmuneSystem": "BloodCells"})
        .set_index("dimension")
    )

    # Lungs and Hearing are represented by their "*" (ensemble) subdimension.
    squeezed_dimensions.loc["Lungs", "subdimension"] = "*"
    squeezed_dimensions.loc["Hearing", "subdimension"] = "*"
    squeezed_dimensions.reset_index(inplace=True)

    # Unique key: dimension concatenated with subdimension ("*" contributes
    # nothing, so e.g. Lungs* squeezes to just "Lungs").
    squeezed_dimensions["squeezed_dimensions"] = (
        squeezed_dimensions["dimension"]
        + squeezed_dimensions["subdimension"].replace("*", "")
    )

    squeezed_dimensions.to_feather(
        "all_data/xwas/squeezed_dimensions_participant_and_time_of_examination.feather"
    )
    upload_file(
        "all_data/xwas/squeezed_dimensions_participant_and_time_of_examination.feather",
        "xwas/squeezed_dimensions_participant_and_time_of_examination.feather",
    )
# NOTE(review): chunk starts mid-list — the opening of `columns_to_take`
# (presumably) is outside this view.
    "Ethnicity.African",
    "Ethnicity.Black_Other",
    "Ethnicity.Chinese",
    "Ethnicity.Other",
    "Ethnicity.Other_ethnicity",
    "Ethnicity.Do_not_know",
    "Ethnicity.Prefer_not_to_answer",
    "Ethnicity.NA",
]

if __name__ == "__main__":
    list_information = []
    # One attention-maps CSV per heart-MRI chamber view (3- and 4-chambers).
    for chamber_type in [3, 4]:
        information_raw = load_csv(
            f"page12_AttentionMapsVideos/RawVideos/files/AttentionMaps-samples_Age_Heart_MRI_{chamber_type}chambersRawVideo.csv",
            usecols=columns_to_take,
        )[columns_to_take].set_index("id")
        # Keep only participants whose aging_rate is "normal".
        information_raw.drop(index=information_raw[
            information_raw["aging_rate"] != "normal"].index,
                             inplace=True)
        # Empty frame to be filled with per-sample metadata.
        information = pd.DataFrame(
            None,
            columns=[
                "chamber", "sex", "age_group", "sample", "chronological_age",
                "biological_age", "ethnicity"
            ],
            index=information_raw.index,
        )
        information["chamber"] = chamber_type
"""Assemble per-dimension linear XWAS correlation tables from per-category CSVs."""
import pandas as pd
from tqdm import tqdm

from dash_website.utils.aws_loader import load_csv
from dash_website import DIMENSIONS, ALL_CATEGORIES

# NOTE(review): chunk is truncated — the final load_csv call is cut mid-argument.
if __name__ == "__main__":
    for dimension in tqdm(DIMENSIONS):
        # First pass: collect every (category, variable) pair for the index,
        # stripping the ".0" suffix carried by raw variable names.
        list_indexes = []
        for category in ALL_CATEGORIES:
            for variable in load_csv(
                    f"page5_LinearXWASResults/LinearOutput/linear_correlations_{category}_{dimension}.csv",
                    usecols=["env_feature_name"],
            )["env_feature_name"].apply(
                    lambda variable: variable.replace(".0", "")):
                list_indexes.append([category, variable])
        indexes = pd.MultiIndex.from_tuples(list_indexes,
                                            names=["category", "variable"])
        # Empty frame to be filled with the per-category statistics.
        correlations = pd.DataFrame(
            None,
            columns=["p_value", "correlation", "sample_size"],
            index=indexes)
        # Second pass: read the actual correlation statistics per category.
        for category in ALL_CATEGORIES:
            correlation_category_dimension = load_csv(
                f"page5_LinearXWASResults/LinearOutput/linear_correlations_{category}_{dimension}.csv",
                usecols=[
                    "env_feature_name", "p_val", "corr_value", "size_na_dropped"
# NOTE(review): chunk starts mid-dict — the opening of COLUMNS_TO_TAKE
# (presumably the raw-CSV -> website column mapping) is outside this view.
    "R-Squared_sd_all": "r2_std",
}

# Sample-definition naming used in output file names.
DATA_TYPE_NAMING = {
    "instances": "all_samples_per_participant",
    "eids": "average_per_participant"
}
# CamelCase algorithm names -> snake_case used on the website.
ALGORITHMS_NAMING = {
    "ElasticNet": "elastic_net",
    "LightGBM": "light_gbm",
    "NeuralNetwork": "neural_network"
}

# Only "instances" is currently processed; "eids" is deliberately disabled.
for data_type in ["instances"]:  # ["eids", "instances"]:
    scores_raw = load_csv(
        f"page2_predictions/Performances/PERFORMANCES_tuned_alphabetical_{data_type}_Age_test.csv"
    )
    scores_ = scores_raw[COLUMNS_TO_TAKE.keys()].rename(
        columns=COLUMNS_TO_TAKE)
    # Harmonize algorithm and dimension names with the rest of the website.
    scores = scores_.replace(ALGORITHMS_NAMING).replace({
        "ImmuneSystem": "BloodCells"
    }).reset_index(drop=True)
    # Musculoskeletal "MRI" rows are relabeled "DXA" — presumably mislabeled
    # upstream; verify against the raw performances CSV.
    scores.loc[(scores["dimension"] == "Musculoskeletal")
               & (scores["sub_subdimension"] == "MRI"),
               "sub_subdimension"] = "DXA"
    scores.to_feather(
        f"all_data/feature_importances/scores_{DATA_TYPE_NAMING[data_type]}.feather"
    )
# NOTE(review): chunk starts mid-dict — the opening of the sample-definition
# naming mapping is outside this view — and is truncated at the end.
    "*": "all_samples_when_possible_otherwise_average",
}

# Raw performances-CSV column -> website column name.
COLUMNS_TO_TAKE = {
    "organ": "dimension",
    "view": "subdimension",
    "transformation": "sub_subdimension",
    "architecture": "algorithm",
    "R-Squared_all": "r2",
    "R-Squared_sd_all": "r2_std",
}
DICT_TO_CHANGE_DIMENSIONS = {"ImmuneSystem": "BloodCells"}

if __name__ == "__main__":
    # Residual correlations (and their standard deviations) exist for both
    # sample definitions.
    for sample_definition in ["instances", "eids"]:
        correlations_raw_ = load_csv(
            f"page4_correlations/ResidualsCorrelations/ResidualsCorrelations_{sample_definition}_Age_test.csv"
        )
        correlations_std_raw_ = load_csv(
            f"page4_correlations/ResidualsCorrelations/ResidualsCorrelations_sd_{sample_definition}_Age_test.csv"
        )
        # Wide correlation matrix -> long format. "Unnamed: 0" is the CSV's
        # unnamed first column holding the row dimension.
        correlations_raw = correlations_raw_.melt(
            id_vars=["Unnamed: 0"],
            value_vars=correlations_raw_.columns.drop("Unnamed: 0"))
        correlations_raw.rename(columns={
            "Unnamed: 0": "dimensions_1",
            "variable": "dimensions_2",
            "value": "correlation"
        },
                                inplace=True)
# NOTE(review): chunk starts mid-dict — the opening of the algorithm-naming
# mapping is outside this view — and is truncated at the end.
    "correlation": "Correlation",
}

if __name__ == "__main__":
    list_features = []
    # Genetics, Phenotypic and PhysicalActivity are excluded from this pass.
    for category in tqdm(
            pd.Index(ALL_CATEGORIES).drop(
                ["Genetics", "Phenotypic", "PhysicalActivity"])):
        for dimension in DIMENSIONS:
            for algorithm in [
                    "elastic_net", "light_gbm", "neural_network", "correlation"
            ]:
                # Medical-diagnoses categories use a different file prefix
                # (no "Clusters_") than the other categories.
                if "medical_diagnoses_" in category:
                    features = load_csv(
                        f"page18_MultivariateXWASFeatures/FeatureImp_{DICT_TO_FORMER_CATEGORIES.get(category, category)}_{DICT_TO_FORMER_DIMENSIONS.get(dimension, dimension)}_{DICT_TO_FORMER_ALGORITHM.get(algorithm, algorithm)}.csv"
                    ).rename(columns={
                        "features": "variable",
                        "weight": "feature_importance"
                    })
                else:
                    features = load_csv(
                        f"page18_MultivariateXWASFeatures/FeatureImp_Clusters_{DICT_TO_FORMER_CATEGORIES.get(category, category)}_{DICT_TO_FORMER_DIMENSIONS.get(dimension, dimension)}_{DICT_TO_FORMER_ALGORITHM.get(algorithm, algorithm)}.csv"
                    ).rename(columns={
                        "features": "variable",
                        "weight": "feature_importance"
                    })
                # Strip the ".0" suffix from raw variable names.
                features["variable"] = features["variable"].apply(
                    lambda variable: variable.split(".0")[0])
                features["category"] = category
                features["dimension"] = dimension
                features["algorithm"] = algorithm
# NOTE(review): chunk starts mid-dict — the opening of DIMENSION_TO_NAME is
# outside this view — and is truncated at the end.
    ("PhysicalActivity", "FullWeek", "Scalars"): "PhysicalActivity",
    # ("Demographics", "All", "Scalars"): "Demographics",
}

if __name__ == "__main__":
    for dimension, subdimension, sub_subdimension in tqdm(DIMENSION_TO_NAME.keys()):
        # Columns: one (algorithm, mean|std) pair per algorithm.
        list_colums = []
        for algorithm in ["correlation", "elastic_net", "light_gbm", "neural_network"]:
            for observation in ["mean", "std"]:
                list_colums.append([algorithm, observation])
        columns = pd.MultiIndex.from_tuples(list_colums, names=["algorithm", "observation"])

        # The ElasticNet file is used only to establish the feature index.
        feature_for_index = load_csv(
            f"page3_featureImp/FeatureImp/FeatureImp_Age_{dimension}_{subdimension}_{sub_subdimension}_ElasticNet.csv"
        ).rename(columns={"features": "feature"})
        # Strip the ".0" suffix, then drop duplicated feature names.
        feature_for_index["feature"] = (
            feature_for_index["feature"].astype(str).apply(lambda feature: feature.split(".0")[0])
        )
        feature_for_index.drop(index=feature_for_index.index[feature_for_index["feature"].duplicated()], inplace=True)

        # Empty frame to be filled with per-algorithm mean/std importances.
        features = pd.DataFrame(None, columns=columns, index=feature_for_index["feature"])
        features.index.name = "feature"

        for algorithm in ["correlation", "elastic_net", "light_gbm", "neural_network"]:
            mean_feature = load_csv(
                f"page3_featureImp/FeatureImp/FeatureImp_Age_{dimension}_{subdimension}_{sub_subdimension}_{ALGORITHM_NAMING[algorithm]}.csv"
            ).rename(columns={"features": "feature"})
            # Same ".0"-stripping and de-duplication as for the index file.
            mean_feature["feature"] = mean_feature["feature"].astype(str).apply(lambda feature: feature.split(".0")[0])
            mean_feature.drop(index=mean_feature.index[mean_feature["feature"].duplicated()], inplace=True)
"""Prepare GWAS size-effect data for the website's volcano plot."""
from dash_website.utils.aws_loader import load_csv

# Raw GWAS-hits CSV column -> website column name.
COLUMNS_TO_TAKE = {
    "SNP": "SNP",
    "CHR": "chromosome",
    "Gene": "Gene",
    "Gene_type": "Gene_type",
    "P_BOLT_LMM_INF": "p_value",
    "BETA": "size_effect",
    "organ": "dimension",
}

if __name__ == "__main__":
    raw_hits = load_csv(
        "page10_GWASResults/Volcano/GWAS_hits_Age_All_withGenes.csv")
    # Keep only the mapped columns and switch to the website naming.
    size_effects = raw_hits[COLUMNS_TO_TAKE].rename(columns=COLUMNS_TO_TAKE)
    size_effects.replace(
        {"*instances1": "*instances1.5x", "ImmuneSystem": "BloodCells"},
        inplace=True,
    )
    # Drop the malformed rows whose dimension is the literal "withGenes".
    bad_rows = size_effects[size_effects["dimension"] == "withGenes"].index
    size_effects.drop(index=bad_rows, inplace=True)
    size_effects.reset_index(drop=True).to_feather(
        "all_data/genetics/gwas/size_effects.feather")
# NOTE(review): chunk starts mid-script — `missing_scores`, `old_scores` and
# `algorithm` come from code outside this view, so the indentation of the
# first statements is approximate. The chunk is also truncated at the end.
pd.concat(
    (missing_scores, old_scores), ignore_index=True
).drop(columns="Unnamed: 0").to_csv(
    f"all_data/page7_MultivariateXWASResults/Scores/Scores_{algorithm}_test.csv"
)
upload_file(
    f"all_data/page7_MultivariateXWASResults/Scores/Scores_{algorithm}_test.csv",
    f"page7_MultivariateXWASResults/Scores/Scores_{algorithm}_test.csv",
)

list_scores = []
for algorithm in CAMEL_TO_SNAKE.keys():
    # index_col=0 drops the CSV row index; "subset" is discarded.
    scores = load_csv(
        f"page7_MultivariateXWASResults/Scores/Scores_{algorithm}_test.csv",
        index_col=0).drop(columns="subset")
    scores.rename(columns={
        "env_dataset": "category",
        "organ": "dimension"
    },
                  inplace=True)
    # Harmonize dimension naming (e.g. ImmuneSystem -> BloodCells).
    scores_cleaned_dimension = scores.set_index("dimension").rename(
        index=DICT_TO_CHANGE_DIMENSIONS).reset_index()
    every_category = np.array(
        scores_cleaned_dimension["category"].tolist())
    # Mask of categories that are NOT medical_diagnoses ones.
    # NOTE(review): the code consuming this mask is outside this view.
    category_to_split = ~scores_cleaned_dimension[
        "category"].str.startswith("medical_diagnoses")
# NOTE(review): chunk starts mid-dict (a spelling-correction mapping whose
# opening is outside this view) and ends mid-statement on an unclosed `list(`.
    "Claudification": "Claudication",
}

# CamelCase algorithm names -> snake_case used on the website.
CAMEL_TO_SNAKE = {
    "ElasticNet": "elastic_net",
    "LightGbm": "light_gbm",
    "NeuralNetwork": "neural_network"
}

if __name__ == "__main__":
    list_correlations = []
    # One correlations CSV per (correlation type, algorithm).
    for correlation_type in ["Pearson", "Spearman"]:
        for algorithm in ["ElasticNet", "LightGbm", "NeuralNetwork"]:
            correlations = load_csv(
                f"page8_MultivariateXWASCorrelations/CorrelationsMultivariate/CorrelationsMultivariate_{correlation_type}_{algorithm}.csv",
                index_col=0,
            )
            correlations.rename(
                columns={
                    "env_dataset": "category",
                    "organ_1": "dimension_1",
                    "organ_2": "dimension_2",
                    "corr": "correlation",
                    "sample_size": "number_features",
                },
                inplace=True,
            )
            # Harmonize dimension naming across the website.
            correlations.replace(DICT_TO_CHANGE_DIMENSIONS, inplace=True)
            correlations["category"] = list(
# NOTE(review): chunk starts with the closing brace of an unseen dict
# (presumably COLUMNS_TO_TAKE) and is truncated at the end.
}

# Harmonize dimension/architecture names with the rest of the website.
DICT_TO_CHANGE_DIMENSIONS = {
    "ImmuneSystem": "BloodCells",
    "InceptionResNetV2": "inception_res_net_v2",
    "InceptionV3": "inception_v3",
    "ElasticNet": "elastic_net",
    "LightGBM": "light_gbm",
    "NeuralNetwork": "neural_network",
    "1DCNN": "1dcnn",
    "3DCNN": "3dcnn",
}

if __name__ == "__main__":
    for sample_definition in ["instances", "eids"]:
        scores = load_csv(
            f"page2_predictions/Performances/PERFORMANCES_withEnsembles_withCI_alphabetical_{sample_definition}_Age_test.csv"
        )[COLUMNS_TO_TAKE].rename(columns=COLUMNS_TO_TAKE)
        # Each "<metric>_and_std" cell encodes "<mean>+-<std>": the mean is
        # taken before the "+", the std after the subsequent "-".
        for metric in ["r2", "rmse", "c_index", "c_index_difference"]:
            scores[metric] = scores[f"{metric}_and_std"].str.split(
                "+", expand=True)[0].astype(np.float32)
            scores[f"{metric}_std"] = (scores[f"{metric}_and_std"].str.split(
                "+",
                expand=True)[1].str.split("-",
                                          expand=True)[1].astype(np.float32))
            scores.drop(columns=f"{metric}_and_std", inplace=True)
        # Musculoskeletal "MRI" rows are relabeled "DXA" — presumably
        # mislabeled upstream; verify against the raw performances CSV.
        scores.loc[(scores["dimension"] == "Musculoskeletal")
                   & (scores["sub_subdimension"] == "MRI"),
                   "sub_subdimension"] = "DXA"
"""Build best-model scores per dimension, patching Hearing and Lungs with
their ensemble ("*") scores."""
import pandas as pd

from dash_website.utils.aws_loader import load_csv

# Raw performances-CSV column -> website column name.
COLUMNS_TO_TAKE = {"organ": "dimension", "view": "subdimension", "R-Squared_all": "r2", "R-Squared_sd_all": "r2_std"}
DICT_TO_CHANGE_DIMENSIONS = {"ImmuneSystem": "BloodCells"}

if __name__ == "__main__":
    # Best-model scores, indexed by dimension.
    # Fix: dropped the `f` prefix from two placeholder-free f-strings (F541).
    scores_raw = (
        load_csv("page2_predictions/Performances/PERFORMANCES_bestmodels_alphabetical_instances_Age_test.csv")[
            COLUMNS_TO_TAKE
        ]
        .rename(columns=COLUMNS_TO_TAKE)
        .set_index("dimension")
    )
    # Ensemble scores, indexed by (dimension, subdimension).
    ensembles_scores_raw = (
        load_csv("page2_predictions/Performances/PERFORMANCES_withEnsembles_alphabetical_instances_Age_test.csv")[
            COLUMNS_TO_TAKE
        ]
        .rename(columns=COLUMNS_TO_TAKE)
        .set_index(["dimension", "subdimension"])
    )
    # Duplicate the subdimension index level as a plain column so it survives
    # the .values extraction below.
    ensembles_scores_raw["subdimension"] = ensembles_scores_raw.index.get_level_values("subdimension")

    # Hearing and Lungs take their "*" (ensemble) row instead of a best model.
    for dimension_to_correct in ["Hearing", "Lungs"]:
        scores_raw.loc[dimension_to_correct, ["subdimension", "r2", "r2_std"]] = ensembles_scores_raw.loc[
            (dimension_to_correct, "*"), ["subdimension", "r2", "r2_std"]
        ].values[0]

    scores = scores_raw.reset_index()
# NOTE(review): chunk starts mid-list — the opening of `columns_to_take` is
# outside this view — and ends mid-list inside the pd.DataFrame call.
    "Ethnicity.Chinese",
    "Ethnicity.Other",
    "Ethnicity.Other_ethnicity",
    "Ethnicity.Do_not_know",
    "Ethnicity.Prefer_not_to_answer",
    "Ethnicity.NA",
]

if __name__ == "__main__":
    list_information = []
    # Walk the dimension -> subdimension -> sub-subdimension tree of
    # attention-map datasets.
    for DIMENSION in list(TREE_TIME_SERIES.keys()):
        for SUBDIMENSION in list(TREE_TIME_SERIES[DIMENSION].keys()):
            for SUB_SUBDIMENSION in TREE_TIME_SERIES[DIMENSION][SUBDIMENSION]:
                information_raw = load_csv(
                    f"page9_AttentionMaps/Attention_maps_infos/AttentionMaps-samples_Age_{DIMENSION}_{SUBDIMENSION}_{SUB_SUBDIMENSION}.csv",
                    usecols=columns_to_take,
                )[columns_to_take].set_index("id")
                # Empty frame to be filled with per-sample metadata.
                information = pd.DataFrame(
                    None,
                    columns=[
                        "dimension",
                        "subdimension",
                        "sub_subdimension",
                        "sex",
                        "age_group",
                        "aging_rate",
                        "sample",
                        "chronological_age",
                        "biological_age",
                        "ethnicity",
# NOTE(review): chunk starts with the closing brace of an unseen dict
# (presumably COLUMNS_TO_TAKE) and ends at a for-statement whose body is
# outside this view.
}

# Sample-definition naming used on the website.
SAMPLE_DEFINITION_NAMING = {
    "instances": "all_samples_per_participant",
    "eids": "average_per_participant",
    "*": "all_samples_when_possible_otherwise_average",
}
DICT_TO_CHANGE_DIMENSIONS = {"ImmuneSystem": "BloodCells"}

if __name__ == "__main__":
    for sample_definition in ["instances", "eids"]:
        # Best-model scores, indexed by dimension.
        scores_raw = (
            load_csv(
                f"page2_predictions/Performances/PERFORMANCES_bestmodels_alphabetical_{sample_definition}_Age_test.csv"
            )[COLUMNS_TO_TAKE]
            .rename(columns=COLUMNS_TO_TAKE)
            .set_index("dimension")
        )
        # Ensemble scores, indexed by (dimension, subdimension).
        ensembles_scores_raw = (
            load_csv(
                f"page2_predictions/Performances/PERFORMANCES_withEnsembles_alphabetical_{sample_definition}_Age_test.csv"
            )[COLUMNS_TO_TAKE]
            .rename(columns=COLUMNS_TO_TAKE)
            .set_index(["dimension", "subdimension"])
        )
        # Duplicate the subdimension index level as a plain column.
        ensembles_scores_raw["subdimension"] = ensembles_scores_raw.index.get_level_values("subdimension")
        # Hearing and Lungs only need patching for the "instances" definition.
        if sample_definition == "instances":
            for dimension_to_correct in ["Hearing", "Lungs"]:
if __name__ == "__main__":
    # For every (dimension, subdimension, sub-subdimension) in the (unseen
    # here) DIMENSION_TO_NAME mapping, load the scalar biomarker dataset and
    # build the column-renaming mapping.
    for dimension, subdimension, sub_subdimension in tqdm(
            DIMENSION_TO_NAME.keys()):
        print(dimension, subdimension, sub_subdimension)
        name = DIMENSION_TO_NAME[(dimension, subdimension, sub_subdimension)]

        # Website naming: "ImmuneSystem" is shown as "BloodCells".
        if dimension == "ImmuneSystem":
            new_dimension = "BloodCells"
        else:
            new_dimension = dimension

        # PhysicalActivity ships a "_short" export; every other dimension
        # ships an "_ethnicity" file.
        if dimension != "PhysicalActivity":
            raw_scalars = load_csv(
                f"page1_biomarkers/BiomarkerDatasets/{name}_ethnicity.csv"
            ).set_index("id")
        else:
            raw_scalars = load_csv(
                f"page1_biomarkers/BiomarkerDatasets/{name}_short.csv"
            ).set_index("id")

        rename_columns = {
            "Sex": "sex",
            "Age when attended assessment centre": "chronological_age"
        }
        # Strip the literal ".0" suffix from feature column names.
        # Fix: the original passed ".0" as a regex, where "." matches ANY
        # character (e.g. "x10" also matched); match the literal substring.
        for feature in raw_scalars.columns[raw_scalars.columns.str.contains(
                ".0", regex=False)]:
            rename_columns[feature] = feature.replace(".0", "")