def test_groupped_test(with_small, overwrite, outputfile='Xdata.csv'):
    """Write the training feature matrix restricted to the STEP3 model terms.

    Reads the ``coef(min)`` table from ``STEP3_CVRES.json`` located under the
    module-level ``best_dir``, collects the training features for every code
    in ``feature_sets.codelist``, expands them with ``expand_poly2`` and
    saves only the columns whose names also occur in the coefficient table.

    Args:
        with_small: Forwarded unchanged to ``collect_feature``.
        overwrite: When falsy and ``outputfile`` already exists, the function
            returns immediately without touching anything.
        outputfile: Destination CSV path (written including the index).

    Returns:
        None.

    Raises:
        AssertionError: If the model JSON file does not exist.
    """
    if exists(outputfile) and not overwrite:
        return

    # ``best_dir`` is a module-level name that is only read here, so no
    # ``global`` declaration is required.
    modelfile = join(best_dir, 'STEP3_CVRES.json')
    assert exists(modelfile), 'model file not found: %s' % modelfile
    with open(modelfile) as handle:
        modeldata = json.load(handle)
    coefdict = modeldata['model']['data']['coef(min)']

    # Drop the intercept term.  Filtering (instead of ``list.remove``) also
    # works when ``keys()`` is a view and when no intercept is present.
    coefs = [name for name in coefdict.keys() if name != '(Intercept)']

    # Interaction terms are spelled 'a*b'; collect the individual names.
    monocoef = []
    for name in coefs:
        monocoef.extend(name.split('*'))
    print(monocoef)

    # ``feature_sets`` lives in ``datadir``; avoid growing sys.path on
    # repeated calls.
    if datadir not in sys.path:
        sys.path.append(datadir)
    import feature_sets
    features = [feature_db[code] for code in feature_sets.codelist]

    # Prepare training data.
    X = collect_feature(features, with_small=with_small, mode='train')
    Xcolumns = Set(X.columns.values.tolist())
    linear_features = list(Xcolumns.intersection(Set(monocoef)))
    X = expand_poly2(X, cols=linear_features)

    # Now X contains the quadratic terms as well.  Keep only the columns that
    # match a model coefficient (this also handles the 'small feature sets').
    Xcolumns = Set(X.columns.values.tolist())
    selected = Set(coefs).intersection(Xcolumns)
    X = X[list(selected)]
    X.to_csv(outputfile)
def mixedmodel(feature_codes, label, coefmode='coef(min)', polyexp=False,
               selected_coefs=None, with_small=False, overwrite=False,
               train_ratio=0.7, nfolds=5, nrepeats=0, threshold=1000,
               with_randomeffect=False, alpha=1.0, ncores=1, clear=False):
    """Train a model on the combination-therapy data and predict with it.

    The function writes the training and leaderboard feature matrices and the
    response vector to CSV files derived from ``label``, calls
    ``model_training``, then runs ``model_test`` / ``model_scoring`` on the
    training set and ``model_test`` on the leaderboard set.

    Args:
        feature_codes: Keys into the module-level ``feature_db``.
        label: Prefix for every file this function writes
            (``<label>.json``, ``<label>_xData.csv``, ``<label>_yData.csv``,
            ``<label>_xDataAns.csv``, ``<label>_PRED.csv``,
            ``<label>_PREDL.csv``, ``<label>_OBSERV.csv``).
        coefmode: Forwarded as ``coef_mode`` to ``model_test`` and
            ``model_scoring``.
        polyexp: When true, both feature matrices are passed through
            ``expand_poly2`` using ``selected_coefs`` as the columns.
        selected_coefs: Columns for ``expand_poly2``; ``None`` means an empty
            list.  Truncated to the first five entries when both ``polyexp``
            and ``with_small`` are true.
        with_small: Forwarded to ``collect_feature`` and ``model_training``.
        overwrite: When falsy and ``<label>_OBSERV.csv`` exists, return
            immediately.
        train_ratio, nfolds, nrepeats, threshold, alpha, ncores: Forwarded
            unchanged to ``model_training``.
        with_randomeffect: When true, the per-group mean of ``SYNERGY_SCORE``
            (grouped by ``SELECTED_RANDOMEFFECT``) is subtracted from the
            response before training, and the per-group mean/std are handed
            to ``model_training`` as ``random_effect``.
        clear: When true, the two feature-matrix CSV files are deleted at the
            end.

    Returns:
        None.
    """
    if selected_coefs is None:
        selected_coefs = []

    model_json = label + '.json'
    xDatafileTrain = label + '_xData.csv'
    yDatafile = label + '_yData.csv'
    xDatafileAns = label + '_xDataAns.csv'
    predfile = label + '_PRED.csv'
    predfileAns = label + '_PREDL.csv'
    obsfile = label + '_OBSERV.csv'

    # The observation file is the last artefact of a run; if it is already
    # there the whole pipeline is skipped unless overwriting was requested.
    if exists(obsfile) and not overwrite:
        return

    therapyfileTrain = pydream2015.DATA_COMBITHERAPY
    dfTherapy = pd.read_csv(therapyfileTrain)
    therapyfileAns = pydream2015.DATA_COMBITHERAPY_LEADER

    features = [feature_db[code] for code in feature_codes]

    # Prepare the training data.
    X = collect_feature(features, with_small=with_small, mode='train')
    if polyexp and with_small:
        selected_coefs = selected_coefs[0:5]
    if polyexp:
        X = expand_poly2(X, cols=selected_coefs)
    X.to_csv(xDatafileTrain, index=False)

    # Prepare the leaderboard data with the same expansion.
    Xans = collect_feature(features, with_small=with_small, mode='leader')
    if polyexp:
        Xans = expand_poly2(Xans, cols=selected_coefs)
    Xans.to_csv(xDatafileAns, index=False)

    # Parenthesised single-argument form prints identically on Python 2 and
    # is also valid Python 3 syntax.
    print('num_samples: %d, num_features: %d' % (X.shape[0], X.shape[1]))

    if with_randomeffect:
        random_effect = {'mean': {}, 'std': {}}
        grouped_scores = dfTherapy.groupby(SELECTED_RANDOMEFFECT)['SYNERGY_SCORE']
        for cell, scores in grouped_scores:
            random_effect['mean'][cell] = scores.mean()
            random_effect['std'][cell] = scores.std()
        # Centre every score on the mean of its own group in one column
        # operation instead of a per-row ``.loc`` update.
        group_means = dfTherapy[SELECTED_RANDOMEFFECT].map(random_effect['mean'])
        dfTherapy['SYNERGY_SCORE'] = dfTherapy['SYNERGY_SCORE'] - group_means
    else:
        random_effect = None

    y = dfTherapy[['SYNERGY_SCORE']]
    y.to_csv(yDatafile, index=False)

    # NOTE(review): ``polyexp`` is passed as False regardless of the argument
    # -- presumably because the matrices written above are already expanded;
    # confirm against model_training.
    model_training(xDatafileTrain, yDatafile, model_json, polyexp=False,
                   with_small=with_small, overwrite=overwrite,
                   train_ratio=train_ratio, nfolds=nfolds, nrepeats=nrepeats,
                   threshold=threshold, with_randomeffect=with_randomeffect,
                   random_effect=random_effect, alpha=alpha, ncores=ncores)

    # Test and score with the training data set.
    model_test(model_json, therapyfileTrain, xDatafileTrain, predfile,
               coef_mode=coefmode, overwrite=overwrite)
    model_scoring(model_json, therapyfileTrain, obsfile, predfile,
                  coef_mode=coefmode)

    # Answer with the leaderboard data set.
    model_test(model_json, therapyfileAns, xDatafileAns, predfileAns,
               coef_mode=coefmode, overwrite=overwrite)

    if clear:
        os.remove(xDatafileTrain)
        os.remove(xDatafileAns)
def mixedmodel(feature_codes, label, predfile='output_pred.csv', trainids=None,
               testids=None, inputfilename='', userid_=0, coefmode='coef(min)',
               with_small=False, overwrite=False, train_ratio=0.66, nfolds=10,
               nrepeats=1, threshold=1000, alpha=1.0, ncores=1, clear=False,
               testfile=None, online=False, repeat=1, sigma=0.0):
    """Predict synergy scores for a user-supplied therapy file.

    Uses the coefficients stored in ``code-with-inputdata/STEP3_FULL.json``
    (next to this module) together with random-effect values computed from
    the training therapy set, and writes the predictions to ``predfile``.

    Args:
        feature_codes: Keys into the module-level ``feature_db``.
        label: Only used for the skip check on ``<label>.json``.
        predfile: Output CSV path.
        inputfilename: CSV file with the therapies to predict; it must
            provide the ``CELL_LINE`` and ``COMBINATION_ID`` columns.
        with_small: Forwarded to ``collect_feature``.
        overwrite, online: The function returns early when ``<label>.json``
            exists and both flags are falsy.
        repeat, sigma: Forwarded to ``predict``.
        trainids, testids, userid_, coefmode, train_ratio, nfolds, nrepeats,
        threshold, alpha, ncores, clear, testfile: Accepted for interface
            compatibility; not used by this function.

    Returns:
        None.  With a single prediction vector the output has a
        ``PREDICTION`` column, otherwise ``PREDICTION0``, ``PREDICTION1``, ...
    """
    dst_dir = '.'
    trainset_csv = join(dst_dir, 'THERAPY_TRAINSET.CSV')
    # The therapy tables are copied into the working directory on every call,
    # even when the early return below is taken.
    shutil.copy(pydream2015.DATA_COMBITHERAPY, trainset_csv)
    shutil.copy(pydream2015.DATA_COMBITHERAPY_TEST,
                join(dst_dir, 'THERAPY_TESTSET.CSV'))
    # NOTE: an earlier deployment copied these files to
    # '/data/ui_input/dream/' + str(userid_) + '<file name>' instead.

    if exists(label + '.json') and not overwrite and not online:
        return

    train_set = pd.read_csv(trainset_csv)
    features = [feature_db[code] for code in feature_codes]
    rndeffval2 = calc_randomeffect2(train_set)

    # The pre-trained model shipped with the code is always used here
    # (formerly '/data/platform_scripts/models/dream2015/code-with-inputdata/
    # STEP3_FULL.json').
    model_json = join(dirname(__file__), 'code-with-inputdata', 'STEP3_FULL.json')
    with open(model_json) as handle:
        model3 = json.load(handle)
    coefmin = model3['data']['coef(min)']

    # Interaction terms are spelled 'a*b'; collect the individual names.
    coefnames = []
    for coef in coefmin.keys():
        coefnames += coef.split('*')
    coefnames = list(set(coefnames))

    XTEST = collect_feature(features, with_small=with_small, mode='user')

    # Expand only the names that are actually present in the feature matrix.
    available = set(XTEST.columns.values.tolist())
    linear_features = list(set(coefnames).intersection(available))
    XTEST = expand_poly2(XTEST, cols=linear_features)

    # Keep the columns for which the model has a coefficient.
    expanded = set(XTEST.columns.values.tolist())
    model_columns = set(coefmin.keys()).intersection(expanded)
    XTEST = XTEST[list(model_columns)]

    PREDICTION = pd.read_csv(inputfilename).copy()
    PREDICTION['SYNERGY_SCORE'] = np.nan
    Yprediction = predict(PREDICTION, XTEST, coefmin, rndeffval2,
                          sigma=sigma, repeat=repeat)

    if len(Yprediction) == 1:
        PREDICTION['PREDICTION'] = Yprediction[0]
        prediction_columns = ['PREDICTION']
    else:
        prediction_columns = []
        for i in range(len(Yprediction)):
            column = 'PREDICTION%d' % i
            PREDICTION[column] = Yprediction[i]
            prediction_columns.append(column)

    out_columns = ['CELL_LINE', 'COMBINATION_ID'] + prediction_columns
    PREDICTION[out_columns].to_csv(predfile, index=False)
def _dream_score(obs_csv, pred_csv):
    """Run ``ch1scoring_fc.run2a`` on two CSV files and return its JSON output."""
    import os
    import shutil

    # A private temporary directory replaces the race-prone
    # ``tempfile.mktemp``; the score file itself does not exist before the
    # scorer runs, and everything is removed afterwards.
    workdir = tempfile.mkdtemp()
    try:
        scorefile = os.path.join(workdir, 'dream_score')
        ch1scoring_fc.run2a(obs_csv, pred_csv, scorefile)
        with open(scorefile, 'rb') as handle:
            return json.load(handle)
    finally:
        shutil.rmtree(workdir, ignore_errors=True)


def _save_model_json(model, path):
    """Write ``model`` to ``path`` as sorted, indented JSON and close the file."""
    with open(path, 'w') as handle:
        json.dump(model, handle, separators=(',', ':'), sort_keys=True,
                  indent=2)


def mixedmodel(feature_codes, label, trainids=None, testids=None,
               coefmode='coef(min)', with_small=False, overwrite=False,
               train_ratio=0.66, nfolds=10, nrepeats=1, threshold=1000,
               alpha=1.0, ncores=1, clear=False, testfile=None):
    """Train the mixed quadratic model and predict in test or CV mode.

    The feature matrix is restricted to the terms listed in the
    ``coef(min)`` tables of ``STEP1.json`` and ``STEP2.json``; the response
    is ``SYNERGY_SCORE`` from ``THERAPY_TRAINSET.CSV`` minus the estimated
    random effect.  After ``model_training`` the resulting ``<label>.json``
    is loaded, used for prediction, and rewritten with an added score entry.

    Modes:
        * ``trainids`` empty/None -- test mode: train on all rows, predict
          ``THERAPY_TESTSET.CSV`` into ``<label>_PRED.csv``, and additionally
          re-predict the training rows (``<label>_PRED_CHECK.csv`` /
          ``<label>_OBS_CHECK.csv``) to store ``score(dream)_check``.
        * otherwise -- cross-validation mode: train on ``trainids``, predict
          ``testids`` and store ``score(dream)``.  Also writes
          ``<label>_cv_train.csv``, ``<label>_cv_test.csv`` and
          ``<label>_OBSERV.csv``.

    Args:
        feature_codes: Keys into the module-level ``feature_db``.
        label: Prefix for every file this function writes.
        trainids: Row labels used for training; empty or None selects test
            mode.
        testids: Row labels held out for evaluation in CV mode.
        with_small: Forwarded to ``collect_feature`` and ``model_training``.
        overwrite: When falsy and ``<label>.json`` exists, return at once.
        train_ratio, nfolds, nrepeats, threshold, alpha, ncores: Forwarded
            unchanged to ``model_training``.
        coefmode, clear, testfile: Accepted for interface compatibility; not
            used by this function.

    Returns:
        None.
    """
    if trainids is None:
        trainids = []
    if testids is None:
        testids = []
    # ``len`` also works for arrays and index objects, for which ``== []``
    # is not a reliable emptiness test.
    cv_mode = len(trainids) > 0

    model_json = label + '.json'
    if exists(model_json) and not overwrite:
        return

    xDatafileTrain = label + '_xDataTrain.csv'
    yDatafileTrain = label + '_yDataTrain.csv'
    predfile = label + '_PRED.csv'
    obsfile = label + '_OBSERV.csv'
    id_columns = ['CELL_LINE', 'COMBINATION_ID']

    with open('STEP1.json') as handle:
        step1data = json.load(handle)
    with open('STEP2.json') as handle:
        step2data = json.load(handle)
    monocoef = list(step1data['data']['coef(min)'].keys())
    combicoef = list(step2data['data']['coef(min)'].keys())

    EXPERIMENT_DATA = pd.read_csv('THERAPY_TRAINSET.CSV')
    features = [feature_db[code] for code in feature_codes]

    # Prepare training data.
    X = collect_feature(features, with_small=with_small, mode='train')
    Xcolumns = Set(X.columns.values.tolist())
    linear_features = list(Xcolumns.intersection(Set(monocoef)))
    X = expand_poly2(X, cols=linear_features)

    # Now X contains the quadratic terms as well.  Keep only the columns that
    # match a STEP1/STEP2 coefficient (this also handles the 'small feature
    # sets').  The list is computed once so that the training and the test
    # matrix get exactly the same columns in the same order.
    Xcolumns = Set(X.columns.values.tolist())
    all_coefnames = Set(monocoef + combicoef)
    selected_columns = list(all_coefnames.intersection(Xcolumns))
    X = X[selected_columns]

    if cv_mode:
        X.loc[trainids].to_csv(xDatafileTrain, index=False)
        TRAIN_SET = EXPERIMENT_DATA.loc[trainids]
        TRAIN_SET.to_csv(label + '_cv_train.csv', index=False)
        TEST_SET = EXPERIMENT_DATA.loc[testids]
        TEST_SET.to_csv(label + '_cv_test.csv', index=False)
        TEST_SET[id_columns + ['SYNERGY_SCORE']].to_csv(obsfile, index=False)
    else:
        X.to_csv(xDatafileTrain, index=False)
        TRAIN_SET = EXPERIMENT_DATA

    rndeffval2 = calc_randomeffect2(TRAIN_SET)

    # We eliminate the random effect from the experimental SYNERGY_SCORE and
    # use the normalized score in the regression step.  Offsets are computed
    # row by row (which also works with repeated row labels) and subtracted
    # in a single column operation.
    Y_without_RndEffect = TRAIN_SET.copy()
    offsets = [estimate_rndeff(rndeffval2, a_therapy)
               for _, a_therapy in Y_without_RndEffect.iterrows()]
    Y_without_RndEffect['SYNERGY_SCORE'] = (
        Y_without_RndEffect['SYNERGY_SCORE'] - np.asarray(offsets, dtype=float))
    Y_without_RndEffect[['SYNERGY_SCORE']].to_csv(yDatafileTrain, index=False)

    # Now we train the mixed quadratic model with the normalized data.
    model_training(xDatafileTrain, yDatafileTrain, model_json,
                   with_small=with_small, overwrite=overwrite,
                   train_ratio=train_ratio, nfolds=nfolds, nrepeats=nrepeats,
                   threshold=threshold, random_effect=rndeffval2, alpha=alpha,
                   ncores=ncores)

    with open(model_json) as handle:
        model3 = json.load(handle)
    coefmin = model3['data']['coef(min)']

    # Next, we apply the trained model to evaluate or to answer.
    if not cv_mode:
        # Test mode (answering the test data set): all data points were used
        # for training, so the model cannot be evaluated because the
        # experimental result is unknown.
        XTEST = collect_feature(features, with_small=with_small, mode='test')
        XTEST = expand_poly2(XTEST, cols=linear_features)
        XTEST = XTEST[selected_columns]

        # The therapy table provides the categorical variables needed for
        # computing the random effects.
        PREDICTION = pd.read_csv('THERAPY_TESTSET.CSV').copy()
        PREDICTION['SYNERGY_SCORE'] = np.nan
        PREDICTION['PREDICTION'] = predict(PREDICTION, XTEST, coefmin,
                                           rndeffval2)
        PREDICTION[id_columns + ['PREDICTION']].to_csv(predfile, index=False)

        # In addition, predict the training set itself and compare with the
        # experimental scores, to make sure the final answer is free of
        # numerical bugs.
        pred_check = label + '_PRED_CHECK.csv'
        obs_check = label + '_OBS_CHECK.csv'
        PREDICTION2 = TRAIN_SET.copy()
        PREDICTION2['SYNERGY_SCORE'] = np.nan
        PREDICTION2['PREDICTION'] = predict(PREDICTION2, X, coefmin,
                                            rndeffval2)
        PREDICTION2[id_columns + ['PREDICTION']].to_csv(pred_check,
                                                        index=False)
        TRAIN_SET[id_columns + ['SYNERGY_SCORE']].to_csv(obs_check,
                                                         index=False)

        # This score is not used to evaluate the model; it only serves to
        # judge whether there is a numerical problem.
        model3['data']['score(dream)_check'] = _dream_score(obs_check,
                                                            pred_check)
    else:
        # Cross-validation mode: here the model can be evaluated.
        PREDICTION = TEST_SET.copy()
        PREDICTION['SYNERGY_SCORE'] = np.nan
        PREDICTION['PREDICTION'] = predict(PREDICTION, X.loc[testids],
                                           coefmin, rndeffval2)
        PREDICTION[id_columns + ['PREDICTION']].to_csv(predfile, index=False)
        model3['data']['score(dream)'] = _dream_score(obsfile, predfile)

    _save_model_json(model3, model_json)
def mixedmodel(feature_codes, label, predfile='output_pred.csv', trainids=None,
               testids=None, coefmode='coef(min)', with_small=False,
               overwrite=False, train_ratio=0.66, nfolds=10, nrepeats=1,
               threshold=1000, alpha=1.0, ncores=1, clear=False, testfile=None,
               online=False):
    """Predict synergy scores for the therapies listed in ``THERAPY_USER.CSV``.

    Uses the coefficients stored in ``STEP3_FULL.json`` (working directory)
    together with random-effect values computed from the training therapy
    set, and writes ``CELL_LINE``, ``COMBINATION_ID`` and ``PREDICTION`` to
    ``predfile``.

    Args:
        feature_codes: Keys into the module-level ``feature_db``.
        label: Only used for the skip check on ``<label>.json``.
        predfile: Output CSV path.
        with_small: Forwarded to ``collect_feature``.
        overwrite, online: The function returns early when ``<label>.json``
            exists and both flags are falsy.
        trainids, testids, coefmode, train_ratio, nfolds, nrepeats,
        threshold, alpha, ncores, clear, testfile: Accepted for interface
            compatibility; not used by this function.

    Returns:
        None.
    """
    # The therapy tables are copied into the working directory on every call,
    # even when the early return below is taken.
    shutil.copy(pydream2015.DATA_COMBITHERAPY, 'THERAPY_TRAINSET.CSV')
    shutil.copy(pydream2015.DATA_COMBITHERAPY_TEST, 'THERAPY_TESTSET.CSV')

    if exists(label + '.json') and not overwrite and not online:
        return

    train_set = pd.read_csv('THERAPY_TRAINSET.CSV')
    features = [feature_db[code] for code in feature_codes]
    rndeffval2 = calc_randomeffect2(train_set)

    # The pre-trained full model is always used for answering.
    with open('STEP3_FULL.json') as handle:
        model3 = json.load(handle)
    coefmin = model3['data']['coef(min)']

    # Interaction terms are spelled 'a*b'; collect the individual names.
    coefnames = []
    for coef in coefmin.keys():
        coefnames += coef.split('*')
    linear_features = list(set(coefnames))

    XTEST = collect_feature(features, with_small=with_small, mode='user')
    XTEST = expand_poly2(XTEST, cols=linear_features)

    # Keep the columns for which the model has a coefficient.
    expanded = set(XTEST.columns.values.tolist())
    model_columns = set(coefmin.keys()).intersection(expanded)
    XTEST = XTEST[list(model_columns)]

    PREDICTION = pd.read_csv('THERAPY_USER.CSV').copy()
    PREDICTION['SYNERGY_SCORE'] = np.nan
    PREDICTION['PREDICTION'] = predict(PREDICTION, XTEST, coefmin, rndeffval2)
    PREDICTION[['CELL_LINE', 'COMBINATION_ID', 'PREDICTION']].to_csv(
        predfile, index=False)