def output(): string = """ test <- function(){ ## Toxicities: survival, feeding tube, aspiration # Outputs: predicted toxicity probability for each patient, # predicted survival probability for each patient, # weight of each variable in each risk prediction model ## Adds therapy as a predictor ## load libraries #library(survival) #library(dplyr) #library(purrr) if(!require("pacman")) install.packages("pacman") pacman::p_load(pacman, survival, tidyverse) ### set file directory ## Modify this to correct file location #file.dir <- "/users/shannonmck/Box/SMART/SMART database/Training set_version1.3/" ### load data #OPC <- read.csv(paste0(file.dir, # "Anonymized_644.Updated_cleaned_v1.3.1.csv")) OPC <- read.csv("newdata.csv") #length(unique(OPC$Dummy.ID)) ############################### ##### Clean clinical data ##### ############################### ## remove duplicates #take the values to a dataframe OPC_final <- OPC #takin first to forty colums OPC_final_clinic <- OPC_final[, c(1:40)] #making some new values using mutate OPC_final_clinic <- mutate(OPC_final_clinic, ajcc_stage = as.character(AJCC.8th.edition), #t3 and t4 will be 1 and everything will be 0 T.category34 = ifelse(T.category == "T3" | T.category == "T4", 1, 0), # n2 and n3 will be 1 and everything will be 0 N.category23 = ifelse(N.category == "N2" | N.category == "N3", 1, 0), # if white/caucasion will be white else all will be other white = ifelse(Race == "White/Caucasion", "White", "Other"), smoke = Smoking.status.at.Diagnosis..Never.Former.Current., # take that as numeric if value N/A make it not a number or NA pack_year = ifelse(Smoking.status..Packs.Year. == "N/A", NA, as.numeric(Smoking.status..Packs.Year.)), # if smoke is never then make 0 pack_year = ifelse(smoke == "Never", 0, pack_year), #for na took the mean of all pack_year = ifelse(is.na(pack_year) == TRUE, mean(pack_year, na.rm = TRUE), pack_year), age = Age.at.Diagnosis..Calculated., neck_boost = Neck.boost..Y.N., neck_dissection = Neck.Disssection.after.IMRT..Y.N., #changed the name of the value for ease i guess #made it as a factor meaning we can count how many are BOTs, Tonsils etc tumor_subsite = as.factor(ifelse(!(Tumor.subsite..BOT.Tonsil.Soft.Palate.Pharyngeal.wall.GPS.NOS.%in%c("BOT","Tonsil")),"Other", as.character(Tumor.subsite..BOT.Tonsil.Soft.Palate.Pharyngeal.wall.GPS.NOS.))) ) #write.csv(OPC_final_clinic, file="opc_final_clinic_after_mutation.csv") #renaming the values levels(OPC_final_clinic$smoke )[levels(OPC_final_clinic$smoke )=="Formar"] <- "Former" levels(OPC_final_clinic$T.category)[levels(OPC_final_clinic$T.category)=="Tis"] <- "T1" levels(OPC_final_clinic$T.category)[levels(OPC_final_clinic$T.category)=="Tx"] <- "T1" #write.csv(OPC_final_clinic, file="after_leveling.csv") ## 6 blanks for feeding tube - omit #removing feeding tube that does not have any value OPC_final_clinic <- OPC_final_clinic[OPC_final_clinic$Feeding.tube.6m!="",] #write.csv(OPC_final_clinic, file="omitting.csv") #as we removed some values above, we fixed the counter for therapeutic combination OPC_final_clinic$Therapeutic.combination <- droplevels(OPC_final_clinic$Therapeutic.combination) #write.csv(OPC_final_clinic, file="theraputicCombination.csv") OPC_final_clinic <- mutate(OPC_final_clinic, #if feedind tube is N then set 0 otherwise 1 feeding_tube = ifelse(Feeding.tube.6m =="N",0,1), #aspiration is N then set 0 otherwise 1 aspiration = ifelse(Aspiration.rate.Y.N.=="N",0,1), #neck_booxt, neck_dissection and hpv are made as factors for counting neck_boost = factor(OPC_final_clinic$neck_boost), neck_dissection = factor(OPC_final_clinic$neck_dissection), HPV.P16.status = factor(OPC_final_clinic$HPV.P16.status)) #levels(OPC_final_clinic$Therapeutic.combination) ## select covariates to include rel_vars <- c("Therapeutic.combination","Gender", "age", "HPV.P16.status", "T.category", "N.category", "smoke", "white", "pack_year","tumor_subsite","neck_boost","neck_dissection") ######################################## ##### Feeding tube: binary outcome ##### ######################################## ## Relevel factor variables to yield positive coefficients #releveling - making the IC+R as the first level OPC_final_clinic$Therapeutic.combination <- relevel(OPC_final_clinic$Therapeutic.combination,"IC+Radiation alone") #levels(OPC_final_clinic$Therapeutic.combination) #write.csv(OPC_final_clinic, file="feedingTubeBinaryOutcome.csv") #factoring gender OPC_final_clinic$Gender <- factor(OPC_final_clinic$Gender,levels=levels(factor(OPC_final_clinic$Gender))[2:1]) #print(factor(OPC_final_clinic$Gender,levels=levels(factor(OPC_final_clinic$Gender))[2:1])) #levels(OPC_final_clinic$neck_boost) #factoring smoke and then make Former as first level OPC_final_clinic$smoke <- factor(OPC_final_clinic$smoke) OPC_final_clinic$smoke <- relevel(OPC_final_clinic$smoke,"Former") #factoring white OPC_final_clinic$white <- factor(OPC_final_clinic$white,levels=c("White","Other")) #making Tonsile and Y as first levels OPC_final_clinic$tumor_subsite <- relevel(OPC_final_clinic$tumor_subsite,"Tonsil") OPC_final_clinic$neck_boost <- relevel(OPC_final_clinic$neck_boost,"Y") ## Logistic regression for feeding tube outcome ## Logistic regression model with main effects #paste0 concats values without any space in between them #collapse means what will add in between each results #creating the formula fmla_ft <- as.formula(paste0("feeding_tube ~",paste0(rel_vars,collapse="+"))) #print(fmla_ft) #applying generalized linear model #glm(formula, data, family) -- need to find the mathemetics for JS fit_ft <- glm(fmla_ft, data=OPC_final_clinic, family="binomial") ###################################### ##### Aspiration: binary outcome ##### ###################################### #releveling and factoring OPC_final_clinic$Therapeutic.combination <- relevel(OPC_final_clinic$Therapeutic.combination,"Radiation alone") OPC_final_clinic$white <- factor(OPC_final_clinic$white,levels=c("Other","White")) OPC_final_clinic$tumor_subsite <- relevel(OPC_final_clinic$tumor_subsite,"BOT") ## Logistic regression model with main effects fmla_asp <- as.formula(paste0("aspiration ~",paste0(rel_vars,collapse="+"))) fit_asp <- glm(fmla_asp,data=OPC_final_clinic,family="binomial") ## note high correlation between 2 binary outcomes ###################################### ##### Survival: censored outcome ##### ###################################### ### outcome # 1) overall - overall survival (OS) # 2) PFS - progression (local, regional, and distant control) free survival ## Add survival time and indicator variables to dataset for OS OPC_final_surv <- OPC_final_clinic OPC_final_surv <- mutate(OPC_final_surv, survtime = OS..Calculated., survind = 1 - Overall.Survival..1.alive..0.dead.) ## remove extraneous variables for analysis OPC_final_surv <- select(OPC_final_surv, Dummy.ID, rel_vars,survtime,survind) ## Relevel factor variables to yield positive coefficients OPC_final_surv$Therapeutic.combination <- relevel(OPC_final_surv$Therapeutic.combination,"IC+Radiation alone") OPC_final_surv$white <- factor(OPC_final_surv$white,levels=c("White","Other")) OPC_final_surv$Gender <- relevel(OPC_final_surv$Gender,"Female") OPC_final_surv$HPV.P16.status <- relevel(OPC_final_surv$HPV.P16.status,"Unknown") OPC_final_surv$smoke <- relevel(OPC_final_surv$smoke,"Never") OPC_final_surv$tumor_subsite <- relevel(OPC_final_surv$tumor_subsite,"BOT") OPC_final_surv$neck_dissection <- relevel(OPC_final_surv$neck_dissection,"N") OPC_final_surv$T.category <- factor(OPC_final_surv$T.category) levels(OPC_final_surv$T.category) <- levels(OPC_final_surv$T.category)[c(4:1)] OPC_final_surv$N.category <- factor(OPC_final_surv$N.category) OPC_final_surv$N.category <- relevel(OPC_final_surv$N.category,"N1") # Cox proportional hazards models - standard covariates #need to learn about coxph and basehaz and exp fmla_surv <- as.formula(paste0("Surv(survtime, survind) ~",paste0(rel_vars,collapse="+"))) fit_os <- coxph(fmla_surv, data=OPC_final_surv) ## baseline hazard #1. took the baseline hazards that have time less than 60 #2. then took the hazards from them #3. finally took the max h0_5yr <- max(basehaz(fit_os, centered=FALSE)[basehaz(fit_os)$time<=60,]$haz) #print(basehaz(fit_os, centered=FALSE)[basehaz(fit_os)$time<=60,]$haz) baseline_haz <- exp(-h0_5yr) ### design matrix #model matrix make the factor values as values in a matrix design.OPC <- data.frame(model.matrix(~Therapeutic.combination,data=OPC_final_surv)[,2:4], t(t(model.matrix(~Gender, data=OPC_final_surv)[,2])), OPC_final_surv$age, model.matrix(~HPV.P16.status+T.category+N.category+smoke+white, data=OPC_final_surv)[,2:12], OPC_final_surv$pack_year, model.matrix(~tumor_subsite+neck_boost+neck_dissection, data=OPC_final_surv)[,2:5]) #head(model.matrix(~Gender, data=OPC_final_surv)[,2]) #head(t(t(model.matrix(~Gender, data=OPC_final_surv)[,2]))) ## predictions for each participant # %*% is matrix multiplication # exp is exponential function preds_os <- exp(-h0_5yr)^exp((as.matrix(design.OPC))%*%(matrix(fit_os$coefficients))) #same process ## Repeat for progression-free survival outcome OPC_final_pfs <- OPC_final_clinic OPC_final_pfs <- mutate(OPC_final_clinic, #pmin sends the minimum value between the two survtime = pmin(Locoregional.control..Time., FDM..months.), survind = ifelse((Locoregional.control..Time. == survtime)* (1 - Locoregional.Control.1.Control.0.Failure.) + (FDM..months. == survtime)*(1 - Distant.Control..1.no.DM..0.DM.) > 0 , 1, 0)) OPC_final_pfs <- select(OPC_final_pfs, Dummy.ID, rel_vars,survtime,survind) ## Relevel factor variables to yield positive coefficients OPC_final_pfs$Therapeutic.combination <- relevel(OPC_final_pfs$Therapeutic.combination,"IC+Radiation alone") OPC_final_pfs$Gender <- relevel(OPC_final_pfs$Gender,"Female") OPC_final_pfs$white <- relevel(OPC_final_pfs$white,"White") OPC_final_pfs$HPV.P16.status <- relevel(OPC_final_pfs$HPV.P16.status,"Unknown") OPC_final_pfs$smoke <- relevel(OPC_final_pfs$smoke,"Never") OPC_final_pfs$tumor_subsite <- relevel(OPC_final_pfs$tumor_subsite,"Other") OPC_final_pfs$neck_dissection <- relevel(OPC_final_pfs$neck_dissection,"N") OPC_final_pfs$T.category <- factor(OPC_final_pfs$T.category) levels(OPC_final_pfs$T.category) <- levels(OPC_final_pfs$T.category)[c(4:1)] OPC_final_pfs$N.category <- factor(OPC_final_pfs$N.category) ## fit Cox proportional hazards model fit_pfs <- coxph(fmla_surv, data=OPC_final_pfs) ## baseline hazard h0_5yr_pfs <- max(basehaz(fit_pfs, centered=FALSE)[basehaz(fit_pfs)$time<=60,]$haz) baseline_haz_pfs <- exp(-h0_5yr_pfs) ### design matrix design.OPC_pfs <- data.frame(model.matrix(~Therapeutic.combination,data=OPC_final_pfs)[,2:4], t(t(model.matrix(~Gender, data=OPC_final_pfs)[,2])), OPC_final_pfs$age, model.matrix(~HPV.P16.status+T.category+N.category+smoke+white, data=OPC_final_pfs)[,2:12], OPC_final_pfs$pack_year, model.matrix(~tumor_subsite+neck_boost+neck_dissection, data=OPC_final_pfs)[,2:5]) ## predictions for each participant preds_pfs <- exp(-h0_5yr_pfs)^exp((as.matrix(design.OPC_pfs))%*%(matrix(fit_pfs$coefficients))) ########################## ##### Compile Output ##### ########################## final_preds <- data.frame(ID=OPC_final_clinic$Dummy.ID, feeding_tube_prob = fit_ft$fitted.values, aspiration_prob = fit_asp$fitted.values, overall_survival_5yr_prob = preds_os, progression_free_5yr_prob = preds_pfs) #write.csv(final_preds, file="Risk_preds.csv") weights <- list(data.frame(coef=fit_ft$coefficients[-1],var=names(fit_ft$coefficients[-1])), data.frame(coef=fit_asp$coefficients[-1],var=names(fit_asp$coefficients[-1])), data.frame(coef=fit_os$coefficients,var=names(fit_os$coefficients)), data.frame(coef=fit_pfs$coefficients,var=names(fit_pfs$coefficients))) final_weights <- reduce(weights,full_join,by="var") colnames(final_weights) <- c("feeding_tube_coef","variable","aspiration_coef", "overall_survival_5y4_coef","progression_free_5yr_coef") final_weights <- final_weights[order(final_weights$variable),] final_weights <- final_weights[,c(2,1,3:5)] rownames(final_weights) <- NULL return(final_preds) #write.csv(final_weights, file="Risk_pred_model_coefficients.csv") } """ powerpack = SignatureTranslatedAnonymousPackage(string, "powerpack") result = powerpack.test() print(result)