def _split_by_codes(values, left_code=1, right_code=2):
    """Return (left, right) positions of entries equal to each code.

    Entries matching neither code are left out of both lists.
    """
    left = [i for i, value in enumerate(values) if value == left_code]
    right = [i for i, value in enumerate(values) if value == right_code]
    return left, right


def _split_at_threshold(values, threshold):
    """Return (left, right) positions with value < threshold / otherwise."""
    left, right = [], []
    for i, value in enumerate(values):
        if value < threshold:
            left.append(i)
        else:
            right.append(i)
    return left, right


def _format_set(values):
    """Render a set as '{a, b, c}' in sorted order.

    Built explicitly so the label depends neither on the interpreter's set
    repr (``set([..])`` vs ``{..}``) nor on hash iteration order.
    """
    return '{' + ', '.join(str(value) for value in sorted(values)) + '}'


def _iter_subset_splits(values, candidates, label):
    """Yield one split per non-empty subset of ``candidates``.

    The right group holds the observations whose value is in the subset
    (``level``); the left group holds every other observation, which also
    includes values that are not in ``candidates`` at all.
    """
    size = len(candidates)
    for mask in range(1, 2 ** size):
        # Zero-padded binary mask: bit j == '1' selects candidates[j].
        bits = bin(mask)[2:].zfill(size)
        level = tuple(c for c, bit in zip(candidates, bits) if bit == '1')
        not_level = tuple(c for c, bit in zip(candidates, bits) if bit == '0')
        left, right = [], []
        for i, value in enumerate(values):
            if value in level:
                right.append(i)
            else:
                left.append(i)
        # Each description is paired with the group it actually describes:
        # right <-> level, left <-> not_level.
        yield (left, right,
               '%s in %s' % (label, str(not_level)),
               '%s in %s' % (label, str(level)))


def _iter_immun_group_splits(values, vals):
    """Yield one split per fixed Immun code group (group vs. the rest)."""
    observed = set(vals)
    groups = [
        ('Immun-B', {1, 2, 3, 4}),
        ('Immun-B', {10}),
        ('Immun-B', {12}),
        ('Immun-B', {13}),
        ('Immun-T', {5, 6, 7, 8}),
        ('Immun-T', {11}),
        ('Immun-T', {14, 15, 16, 17, 18, 19, 20}),
    ]
    for label, level in groups:
        rest = observed.difference(level)
        left = [i for i, value in enumerate(values) if value in rest]
        right = [i for i, value in enumerate(values) if value in level]
        yield left, right, 'Immun', '%s in %s' % (label, _format_set(level))


def _iter_level_splits(values, cov, vals):
    """Yield a threshold split at every observed value, labelled by value sets."""
    for lvl in vals:
        left, right = _split_at_threshold(values, lvl)
        below = [x for x in vals if x < lvl]
        at_or_above = [x for x in vals if x >= lvl]
        yield (left, right,
               '%s = %s' % (cov, '{' + str(below)[1:-1] + '}'),
               '%s = %s' % (cov, '{' + str(at_or_above)[1:-1] + '}'))


def _iter_numeric_splits(values, cov, vals):
    """Yield a threshold split at every observed value, labelled '<' / '>='."""
    for level in vals:
        left, right = _split_at_threshold(values, level)
        if level != int(level):
            yield (left, right,
                   '%s < %.2f' % (cov, level), '%s >= %.2f' % (cov, level))
        else:
            yield (left, right,
                   '%s < %d' % (cov, level), '%s >= %d' % (cov, level))


def _make_split(samp, cov, vals, mod):
    """Generate candidate two-way splits of ``samp`` on covariate ``cov``.

    Yields tuples ``(left, right, desc_left, desc_right)`` where ``left`` and
    ``right`` are lists of observation positions and the descriptions are
    human-readable labels for the two groups.

    Raises:
        SystemExit: for an unknown covariate or an unsupported mode (same
            effect as the ``exit('make_split() error')`` it replaces, without
            relying on the interactive ``site`` helper).
    """
    values = samp[cov]  # looked up once instead of on every iteration
    if cov == 'Sex':
        left, right = _split_by_codes(values)
        yield left, right, 'Male', 'Female'
    elif cov == 'Immun':
        if mod == desc.modes[0]:
            # First half of the observed values is treated as the B group,
            # second half as the T group.  '//' keeps the slice index an int.
            half = len(vals) // 2
            for split in _iter_subset_splits(values, vals[:half], 'Immun-B'):
                yield split
            for split in _iter_subset_splits(values, vals[half:], 'Immun-T'):
                yield split
        elif mod == desc.modes[1] or mod == desc.modes[2]:
            for split in _iter_immun_group_splits(values, vals):
                yield split
        else:
            raise SystemExit('make_split() error')
    elif cov == 'CNS':
        if mod == desc.modes[0]:
            left, right = _split_by_codes(values)
            yield left, right, 'CNS = 1', 'CNS = 2'
        elif mod == desc.modes[1] or mod == desc.modes[2]:
            for split in _iter_level_splits(values, cov, vals):
                yield split
        else:
            raise SystemExit('make_split() error')
    elif cov == 'Mediastinum':
        if mod == desc.modes[0] or mod == desc.modes[1]:
            left, right = _split_by_codes(values)
            yield left, right, 'Mediastinum = 1', 'Mediastinum = 2'
        elif mod == desc.modes[2]:
            for split in _iter_level_splits(values, cov, vals):
                yield split
        else:
            raise SystemExit('make_split() error')
    elif cov in ('Age', 'Leuc', 'Leber', 'Milz'):
        for split in _iter_numeric_splits(values, cov, vals):
            yield split
    else:
        raise SystemExit('make_split() error')


def _insert_by_p_value(splits, new_split):
    """Insert ``new_split`` into ``splits``, kept ascending by last element.

    The new entry goes before the first existing entry whose last element is
    strictly greater, so ties keep their insertion order.  Mutates and
    returns ``splits``.
    """
    for pos, existing in enumerate(splits):
        if existing[-1] > new_split[-1]:
            splits.insert(pos, new_split)
            break
    else:
        splits.append(new_split)
    return splits


def _analyse_subgroup(indices, treatment, outcome, time, treatment_type, a_logrank):
    """Build an ``sc.Subgroup`` for the observations at ``indices``.

    Separates the two treatment arms, fits a Kaplan-Meier curve per arm via
    ``stat.kaplan_meier``, and runs ``stat.logrank`` / ``stat.logrank_power``
    between the arms.

    Returns:
        (subgroup, logrank_result)
    """
    treat = [treatment[i] for i in indices]
    out = [outcome[i] for i in indices]
    times = [time[i] for i in indices]
    out_a, t_a, out_b, t_b = sep_treats(convert_data(out), times, treat, treatment_type)
    kmf_a = stat.kaplan_meier(out_a, t_a, treatment_type[0])
    kmf_b = stat.kaplan_meier(out_b, t_b, treatment_type[1])
    surv_a = stat.get_kmf_survival(kmf_a)
    surv_b = stat.get_kmf_survival(kmf_b)
    result = stat.logrank(out_a, t_a, out_b, t_b, alpha=a_logrank)
    power = stat.logrank_power(min(len(out_a), len(out_b)), surv_a, surv_b, alpha=a_logrank)
    subgroup = sc.Subgroup()
    subgroup.set_subgroup(indices, kmf_a, treatment_type[0], kmf_b, treatment_type[1],
                          logrank=result, pwr=power)
    return subgroup, result


def select_best_k_splits(sample, cov_name, a_logrank, cov_at_level, min_sub_size, mode):
    """Return the best two-way splits of ``sample`` on one covariate.

    Every candidate split produced for ``cov_name`` is scored with
    ``split_criterion`` applied to the log-rank test statistics of its two
    subgroups; the ``cov_at_level`` candidates with the smallest score are
    kept.

    Args:
        sample: container indexed by covariate name; each column is indexed
            by integer observation position.
        cov_name: covariate to split on ('Sex', 'Immun', 'CNS',
            'Mediastinum', 'Age', 'Leuc', 'Leber' or 'Milz').
        a_logrank: alpha forwarded to ``stat.logrank`` and
            ``stat.logrank_power``.
        cov_at_level: maximum number of splits to keep.
        min_sub_size: minimum number of observations required on each side
            of a split; smaller splits are skipped.
        mode: compared against the entries of ``desc.modes`` to choose the
            splitting scheme for 'Immun', 'CNS' and 'Mediastinum'.

    Returns:
        A list of tuples
        ``(group1, desc1, group2, desc2, cov_name, p_value_split)`` sorted
        ascending by ``p_value_split``.  Empty when the covariate has fewer
        than two distinct values.

    Raises:
        SystemExit: if ``cov_name`` or ``mode`` is not supported.
    """
    treatment = sample[desc.covariates[0]]  # Rand - protocol
    outcome = sample[desc.covariates[1]]  # Tod - alive, lost, dead
    time = sample[desc.covariates[2]]  # Time - lifetime
    treatment_type = sorted(set(treatment))

    best_k_splits = []
    cov_values = tuple(set(sample[cov_name]))
    # A covariate with no values or one distinct value cannot be split.
    if len(cov_values) < 2:
        return best_k_splits

    for left, right, desc_left, desc_right in _make_split(sample, cov_name, cov_values, mode):
        if len(left) < min_sub_size or len(right) < min_sub_size:
            continue
        group1, res1 = _analyse_subgroup(left, treatment, outcome, time,
                                         treatment_type, a_logrank)
        group2, res2 = _analyse_subgroup(right, treatment, outcome, time,
                                         treatment_type, a_logrank)
        p_split = split_criterion(res1.test_statistic, res2.test_statistic)
        _insert_by_p_value(best_k_splits,
                           (group1, desc_left, group2, desc_right, cov_name, p_split))
        # Keep only the best cov_at_level entries: drop the current worst.
        if len(best_k_splits) > cov_at_level:
            best_k_splits.pop()
    return best_k_splits
def make_subgroup_from_all(sample, a_logrank=desc.a_logrank):
    """Build a single ``sc.Subgroup`` covering every observation in ``sample``.

    The two treatment arms are separated, a Kaplan-Meier curve is fitted per
    arm via ``stat.kaplan_meier``, and ``stat.logrank`` /
    ``stat.logrank_power`` are evaluated between the arms.

    Args:
        sample: container indexed by covariate name.
        a_logrank: alpha forwarded to ``stat.logrank`` and
            ``stat.logrank_power``; defaults to ``desc.a_logrank``.

    Returns:
        The populated ``sc.Subgroup``.
    """
    treatment = sample[desc.covariates[0]]  # Rand - protocol
    outcome = sample[desc.covariates[1]]  # Tod - alive, lost, dead
    time = sample[desc.covariates[2]]  # Time - lifetime
    treatment_type = sorted(set(treatment))

    out1, t1, out2, t2 = sep_treats(convert_data(outcome), time, treatment, treatment_type)
    kmf1 = stat.kaplan_meier(out1, t1, treatment_type[0])
    kmf2 = stat.kaplan_meier(out2, t2, treatment_type[1])
    surv1 = stat.get_kmf_survival(kmf1)
    surv2 = stat.get_kmf_survival(kmf2)
    res = stat.logrank(out1, t1, out2, t2, alpha=a_logrank)
    # Named 'power' rather than 'pow' so the builtin is not shadowed.
    power = stat.logrank_power(min(len(out1), len(out2)), surv1, surv2, alpha=a_logrank)

    # Member indices are observation positions, so count the entries of a
    # column: len(sample) would count covariates if sample is dict-like.
    members = list(range(len(treatment)))
    subgroup = sc.Subgroup()
    subgroup.set_subgroup(members, kmf1, treatment_type[0], kmf2, treatment_type[1],
                          logrank=res, pwr=power)
    return subgroup