Example #1
0
def test_groupped_test(with_small, overwrite, outputfile='Xdata.csv'):
    """Write the training feature matrix restricted to the STEP3 coefficients.

    Loads ``STEP3_CVRES.json`` from the module-level ``best_dir``, takes the
    coefficient names stored under ``['model']['data']['coef(min)']``, builds
    the training feature matrix for every code in ``feature_sets.codelist``,
    expands it with ``expand_poly2`` and saves the columns whose names match
    a model coefficient to ``outputfile``.

    Args:
        with_small: Forwarded unchanged to ``collect_feature``.
        overwrite: When equal to ``False`` and ``outputfile`` already exists,
            the function returns immediately without doing any work.
        outputfile: Path of the CSV file to write.

    Returns:
        None.

    Raises:
        AssertionError: If ``STEP3_CVRES.json`` is missing from ``best_dir``.
    """
    # Compared against False (not ``not overwrite``) so that only a literal
    # False/0 skips the work, exactly as before.
    if exists(outputfile) and (overwrite == False):
        return

    modelfile = join(best_dir, 'STEP3_CVRES.json')
    assert exists(modelfile)

    # Close the model file deterministically instead of leaking the handle.
    with open(modelfile) as handle:
        modeldata = json.load(handle)
    coefdict = modeldata['model']['data']['coef(min)']

    # Filtering (rather than ``keys().remove``) does not depend on ``keys()``
    # returning a list and tolerates a model without an intercept term.
    coefs = [name for name in coefdict if name != '(Intercept)']

    # Interaction terms are written as 'a*b'; split them into single names.
    monocoef = [term for name in coefs for term in name.split('*')]

    print(monocoef)

    # Avoid growing sys.path by one duplicate entry per call.
    if datadir not in sys.path:
        sys.path.append(datadir)
    import feature_sets

    features = [feature_db[code] for code in feature_sets.codelist]

    # Prepare training data
    X = collect_feature(features, with_small=with_small, mode='train')

    Xcolumns = Set(X.columns.values.tolist())
    linear_features = list(Xcolumns.intersection(Set(monocoef)))
    X = expand_poly2(X, cols=linear_features)

    # Now the columns also contain the quadratic terms. Keep only the
    # columns whose names match a coefficient of the model.
    Xcolumns = Set(X.columns.values.tolist())
    all_coefnames_i = Set(coefs).intersection(Xcolumns)
    X = X[list(all_coefnames_i)]

    X.to_csv(outputfile)
Example #2
0
def mixedmodel(feature_codes, label, coefmode='coef(min)', polyexp=False,
        selected_coefs=[], with_small=False, overwrite=False, 
        train_ratio=0.7, nfolds=5, nrepeats=0, threshold=1000,
        with_randomeffect=False, alpha=1.0, ncores=1, clear=False):
    """Train a model on the combination-therapy data and apply it.

    All output files are named ``label`` plus a fixed suffix. The function
    builds the feature matrices for the 'train' and 'leader' modes, writes
    them and the ``SYNERGY_SCORE`` target to CSV, then calls
    ``model_training``, ``model_test`` and ``model_scoring`` on the training
    data and ``model_test`` on the leaderboard data.

    Args:
        feature_codes: Keys into ``feature_db`` selecting the features.
        label: Prefix for every file this function reads or writes.
        coefmode: Forwarded as ``coef_mode`` to ``model_test`` and
            ``model_scoring``.
        polyexp: When true, both feature matrices are expanded with
            ``expand_poly2`` over ``selected_coefs``.
        selected_coefs: Columns handed to ``expand_poly2``. Only the first
            five are used when both ``polyexp`` and ``with_small`` are true.
            The default list is never mutated here (slicing makes a copy).
        with_small: Forwarded to ``collect_feature`` and ``model_training``.
        overwrite: When equal to ``False`` and ``<label>_OBSERV.csv`` exists,
            the function returns immediately. Also forwarded to
            ``model_training`` and ``model_test``.
        train_ratio, nfolds, nrepeats, threshold, alpha, ncores: Forwarded
            to ``model_training``.
        with_randomeffect: When true, the mean ``SYNERGY_SCORE`` of each
            ``SELECTED_RANDOMEFFECT`` group is subtracted from the target
            before training.
        clear: When true, the two x-data CSV files are removed at the end.

    Returns:
        None.
    """
    model_json = label + '.json'
    xDatafileTrain = label + '_xData.csv'
    yDatafile = label + '_yData.csv'
    xDatafileAns = label + '_xDataAns.csv'
    predfile = label + '_PRED.csv'
    predfileAns = label + '_PREDL.csv'
    obsfile = label + '_OBSERV.csv'

    if exists(obsfile) and (overwrite == False):
        return

    therapyfileTrain = pydream2015.DATA_COMBITHERAPY
    dfTherapy = pd.read_csv(therapyfileTrain)

    therapyfileAns = pydream2015.DATA_COMBITHERAPY_LEADER

    features = [feature_db[code] for code in feature_codes]

    # Prepare the training data.
    X = collect_feature(features, with_small=with_small, mode='train')

    if polyexp and with_small:
        selected_coefs = selected_coefs[0:5]

    if polyexp:
        X = expand_poly2(X, cols=selected_coefs)

    X.to_csv(xDatafileTrain, index=False)

    # Prepare the leaderboard data the same way.
    Xans = collect_feature(features, with_small=with_small, mode='leader')

    if polyexp:
        Xans = expand_poly2(Xans, cols=selected_coefs)

    Xans.to_csv(xDatafileAns, index=False)

    # Parenthesised single argument: prints the same text under Python 2 and
    # is also valid syntax under Python 3.
    print('num_samples: %d, num_features: %d' % (X.shape[0], X.shape[1]))

    if with_randomeffect:
        random_effect = {'mean': {}, 'std': {}}
        therapy_groups = dfTherapy.groupby(SELECTED_RANDOMEFFECT).groups
        for cell, ids in therapy_groups.items():
            scores = dfTherapy.loc[ids, 'SYNERGY_SCORE']
            random_effect['mean'][cell] = scores.mean()
            random_effect['std'][cell] = scores.std()

        # Subtract each row's group mean in one column operation instead of
        # a per-row .loc read-modify-write. Indexing the dict (rather than
        # passing it to map) keeps the KeyError for an unknown group.
        group_mean = random_effect['mean']
        offsets = dfTherapy[SELECTED_RANDOMEFFECT].map(
                lambda cell: group_mean[cell])
        dfTherapy['SYNERGY_SCORE'] = dfTherapy['SYNERGY_SCORE'] - offsets
    else:
        random_effect = None

    y = dfTherapy[['SYNERGY_SCORE']]
    y.to_csv(yDatafile, index=False)

    # polyexp is passed as False here regardless of the argument; presumably
    # because the expansion was already applied to the CSV written above.
    model_training(xDatafileTrain, yDatafile, model_json, polyexp=False, 
            with_small=with_small, overwrite=overwrite, train_ratio=train_ratio, 
            nfolds=nfolds, nrepeats=nrepeats, threshold=threshold, 
            with_randomeffect=with_randomeffect, random_effect=random_effect,
            alpha=alpha,ncores=ncores)

    # test and scoring with training data set: 
    model_test(model_json, therapyfileTrain, xDatafileTrain, predfile, 
            coef_mode=coefmode, overwrite=overwrite)
    model_scoring(model_json, therapyfileTrain, obsfile, predfile, 
            coef_mode=coefmode)

    # answer with leaderboard data set: 
    model_test(model_json, therapyfileAns, xDatafileAns, predfileAns, coef_mode=coefmode, 
            overwrite=overwrite)

    if clear:
        os.remove(xDatafileTrain)
        os.remove(xDatafileAns)
Example #3
0
def mixedmodel(feature_codes,
               label,
               predfile='output_pred.csv',
               trainids=[],
               testids=[],
               inputfilename='',
               userid_=0,
               coefmode='coef(min)',
               with_small=False,
               overwrite=False,
               train_ratio=0.66,
               nfolds=10,
               nrepeats=1,
               threshold=1000,
               alpha=1.0,
               ncores=1,
               clear=False,
               testfile=None,
               online=False,
               repeat=1,
               sigma=0.0):
    """Predict scores for a user-supplied therapy file with the STEP3 model.

    Copies the training and test therapy files into the current directory,
    loads the coefficients stored under ``['data']['coef(min)']`` of
    ``code-with-inputdata/STEP3_FULL.json`` (located next to this module),
    builds the 'user' feature matrix restricted to those coefficients and
    writes the output of ``predict`` to ``predfile``.

    Args:
        feature_codes: Keys into ``feature_db`` selecting the features.
        label: ``<label>.json`` is only used for the early-return check.
        predfile: Path of the CSV file receiving the predictions.
        inputfilename: CSV file with the therapies to predict.
        with_small: Forwarded to ``collect_feature``.
        overwrite: See ``online``.
        online: The function returns right after copying the therapy files
            when ``<label>.json`` exists and both ``overwrite`` and
            ``online`` equal ``False``.
        repeat, sigma: Forwarded to ``predict``.
        trainids, testids, userid_, coefmode, train_ratio, nfolds, nrepeats,
        threshold, alpha, ncores, clear, testfile: Accepted for interface
            compatibility; not used by this implementation.

    Returns:
        None.
    """
    dst_dir = '.'
    trainset_copy = join(dst_dir, 'THERAPY_TRAINSET.CSV')

    shutil.copy(pydream2015.DATA_COMBITHERAPY, trainset_copy)
    shutil.copy(pydream2015.DATA_COMBITHERAPY_TEST,
                join(dst_dir, 'THERAPY_TESTSET.CSV'))

    if exists(label + '.json') and (overwrite == False) and online == False:
        return

    EXPERIMENT_DATA = pd.read_csv(trainset_copy)

    features = [feature_db[code] for code in feature_codes]

    # The random effects are computed from the full training set.
    rndeffval2 = calc_randomeffect2(EXPERIMENT_DATA)

    model_json = join(dirname(__file__), 'code-with-inputdata',
                      'STEP3_FULL.json')

    # Close the model file deterministically instead of leaking the handle.
    with open(model_json) as handle:
        model3 = json.load(handle)
    coefmin = model3['data']['coef(min)']

    # Interaction terms are written as 'a*b'; split them into single names.
    coefnames = [term
                 for coef in coefmin.keys()
                 for term in coef.split('*')]
    unique_names = list(set(coefnames))

    XTEST = collect_feature(features, with_small=with_small, mode='user')

    # Only expand over the names that are actually columns of XTEST.
    xtest_set = set(XTEST.columns.values.tolist())
    linear_features = list(set(unique_names).intersection(xtest_set))

    XTEST = expand_poly2(XTEST, cols=linear_features)

    # Keep only the expanded columns that match a model coefficient.
    matched = set(coefmin.keys()).intersection(
        set(XTEST.columns.values.tolist()))
    XTEST = XTEST[list(matched)]

    TEST_SET = pd.read_csv(inputfilename)
    PREDICTION = TEST_SET.copy()
    PREDICTION['SYNERGY_SCORE'] = np.nan

    Yprediction = predict(PREDICTION,
                          XTEST,
                          coefmin,
                          rndeffval2,
                          sigma=sigma,
                          repeat=repeat)

    id_columns = ['CELL_LINE', 'COMBINATION_ID']

    if len(Yprediction) == 1:
        PREDICTION['PREDICTION'] = Yprediction[0]
        pred_columns = ['PREDICTION']
    else:
        # One numbered column per element of Yprediction; the names are
        # built once and reused for the column selection below.
        pred_columns = []
        for i in range(len(Yprediction)):
            column = 'PREDICTION%d' % i
            PREDICTION[column] = Yprediction[i]
            pred_columns.append(column)

    PREDICTION[id_columns + pred_columns].to_csv(predfile, index=False)
Example #4
0
def _record_dream_score(model3, model_json, key, obsfile, predfile):
    """Score ``predfile`` against ``obsfile`` and persist the result.

    Runs ``ch1scoring_fc.run2a`` into a temporary score file, stores the
    parsed JSON under ``model3['data'][key]`` and rewrites ``model_json``
    with the updated model.
    """
    import os  # local import: only needed for the temp-file cleanup

    # NOTE(review): tempfile.mktemp() is deprecated because the name can be
    # claimed by another process before use. It is kept because it is not
    # visible here whether run2a accepts an already existing score file.
    scorefile = tempfile.mktemp()
    try:
        ch1scoring_fc.run2a(obsfile, predfile, scorefile)
        with open(scorefile, 'rb') as handle:
            model3['data'][key] = json.load(handle)
    finally:
        # The score file is scratch data; do not leave it behind.
        if exists(scorefile):
            os.remove(scorefile)

    with open(model_json, 'wb') as handle:
        json.dump(model3, handle, separators=(',',':'),
                sort_keys=True, indent=2)


def mixedmodel(feature_codes, label, trainids=[], testids=[], coefmode='coef(min)', 
        with_small=False, overwrite=False, train_ratio=0.66, nfolds=10, nrepeats=1, 
        threshold=1000, alpha=1.0, ncores=1, clear=False, testfile=None):
    """Train the mixed quadratic model and either answer or cross-validate.

    The feature matrix is restricted to the coefficients found in
    ``STEP1.json`` and ``STEP2.json`` (read from the current directory). The
    estimated random effect is subtracted from ``SYNERGY_SCORE`` before
    ``model_training`` is called. Afterwards:

    * ``trainids == []`` (test mode): the model is trained on all rows of
      ``THERAPY_TRAINSET.CSV``, predictions for ``THERAPY_TESTSET.CSV`` go to
      ``<label>_PRED.csv``, and a sanity-check score computed on the training
      data is stored under ``score(dream)_check`` in ``<label>.json``.
    * otherwise (cross-validation mode): rows ``trainids`` are used for
      training, rows ``testids`` are predicted and scored, and the score is
      stored under ``score(dream)`` in ``<label>.json``.

    Args:
        feature_codes: Keys into ``feature_db`` selecting the features.
        label: Prefix for every file this function writes.
        trainids: Row labels used for training; ``[]`` selects test mode.
        testids: Row labels held out for evaluation (cross-validation only).
        with_small: Forwarded to ``collect_feature`` and ``model_training``.
        overwrite: When equal to ``False`` and ``<label>.json`` exists, the
            function returns immediately. Also forwarded to
            ``model_training``.
        train_ratio, nfolds, nrepeats, threshold, alpha, ncores: Forwarded
            to ``model_training``.
        coefmode, clear, testfile: Accepted for interface compatibility; not
            used by this implementation.

    Returns:
        None.
    """
    model_json = label + '.json'

    if exists(model_json) and (overwrite == False):
        return

    xDatafileTrain = label + '_xDataTrain.csv'
    yDatafileTrain = label + '_yDataTrain.csv'
    predfile = label + '_PRED.csv'
    obsfile = label + '_OBSERV.csv'

    # Close the step files deterministically instead of leaking the handles.
    with open('STEP1.json') as handle:
        step1data = json.load(handle)
    with open('STEP2.json') as handle:
        step2data = json.load(handle)
    monocoef = step1data['data']['coef(min)'].keys()
    combicoef = step2data['data']['coef(min)'].keys()

    EXPERIMENT_DATA = pd.read_csv('THERAPY_TRAINSET.CSV')

    features = [feature_db[code] for code in feature_codes]

    # Prepare training data
    X = collect_feature(features, with_small=with_small, mode='train')

    Xcolumns = Set(X.columns.values.tolist())
    linear_features = list(Xcolumns.intersection(Set(monocoef)))
    X = expand_poly2(X, cols=linear_features)

    # Now the columns also contain the quadratic terms. Intersecting with
    # the actual columns is needed to match 'small feature sets'.
    Xcolumns = Set(X.columns.values.tolist())
    all_coefnames = Set(monocoef + combicoef)
    all_coefnames_i = all_coefnames.intersection(Xcolumns)
    X = X[list(all_coefnames_i)]

    if trainids == []:
        X.to_csv(xDatafileTrain, index=False)
        TRAIN_SET = EXPERIMENT_DATA
    else:
        X.loc[trainids].to_csv(xDatafileTrain, index=False)
        TRAIN_SET = EXPERIMENT_DATA.loc[trainids]
        TRAIN_SET.to_csv(label + '_cv_train.csv', index=False)

        TEST_SET = EXPERIMENT_DATA.loc[testids]
        TEST_SET.to_csv(label + '_cv_test.csv', index=False)
        TEST_SET[['CELL_LINE','COMBINATION_ID','SYNERGY_SCORE']].to_csv(
                obsfile, index=False)

    rndeffval2 = calc_randomeffect2(TRAIN_SET)

    # We eliminate the random effect from the experimental SYNERGY_SCORE and
    # use the normalized SYNERGY_SCORE in the regression step.
    Y_without_RndEffect = TRAIN_SET.copy()
    for i in Y_without_RndEffect.index:
        a_therapy = Y_without_RndEffect.loc[i]
        Y_without_RndEffect.loc[i, 'SYNERGY_SCORE'] -= estimate_rndeff(
                rndeffval2, a_therapy)

    Y_without_RndEffect[['SYNERGY_SCORE']].to_csv(yDatafileTrain, index=False)

    # Now, we train the mixed quadratic model with prepared normalized data.
    model_training(xDatafileTrain, yDatafileTrain, model_json, 
            with_small=with_small, overwrite=overwrite, train_ratio=train_ratio, 
            nfolds=nfolds, nrepeats=nrepeats, threshold=threshold,
            random_effect=rndeffval2, alpha=alpha,ncores=ncores)

    with open(model_json) as handle:
        model3 = json.load(handle)
    coefmin = model3['data']['coef(min)']

    # Next, we apply the trained model to evaluate and answer. Evaluation and
    # answer are crossvalidation and test mode, respectively.
    if trainids == []:
        # This is test mode (answering the test data set). In this mode, we
        # use all of the data points to train the model. We cannot evaluate
        # this model because we don't know the experimental result.

        # Collect features for the test data set, and then expand them.
        XTEST = collect_feature(features, with_small=with_small, mode='test')
        XTEST = expand_poly2(XTEST, cols=linear_features)
        XTEST = XTEST[list(all_coefnames_i)]

        # Prepare therapy data for answering. The therapy data provides
        # categorical variables for computing random effects.
        TEST_SET = pd.read_csv('THERAPY_TESTSET.CSV')
        PREDICTION = TEST_SET.copy()
        PREDICTION['SYNERGY_SCORE'] = np.nan
        Yprediction = predict(PREDICTION, XTEST, coefmin, rndeffval2)
        PREDICTION['PREDICTION'] = Yprediction
        PREDICTION[['CELL_LINE','COMBINATION_ID','PREDICTION']].to_csv(
                predfile, index=False)

        # In addition, we predict the SYNERGY_SCORE of the training data set
        # and compare it with the experimental one. With this result we can
        # make sure that the final answer does not suffer from numerical
        # bugs.
        PREDICTION2 = TRAIN_SET.copy()
        PREDICTION2['SYNERGY_SCORE'] = np.nan
        Yprediction2 = predict(PREDICTION2, X, coefmin, rndeffval2)
        PREDICTION2['PREDICTION'] = Yprediction2
        PREDICTION2[['CELL_LINE','COMBINATION_ID','PREDICTION']].to_csv(
                label+'_PRED_CHECK.csv', index=False)
        TRAIN_SET[['CELL_LINE','COMBINATION_ID','SYNERGY_SCORE']].to_csv(
                label+'_OBS_CHECK.csv', index=False)

        # We can calculate the score, but the score is not used to evaluate
        # the model. It is only used to judge whether there is a numerical
        # problem or not.
        _record_dream_score(model3, model_json, 'score(dream)_check',
                label+'_OBS_CHECK.csv', label+'_PRED_CHECK.csv')

    else:
        # This is crossvalidation mode. In this mode, we can evaluate the model.
        PREDICTION = TEST_SET.copy()
        PREDICTION['SYNERGY_SCORE'] = np.nan
        Yprediction = predict(PREDICTION, X.loc[testids], coefmin, rndeffval2)
        PREDICTION['PREDICTION'] = Yprediction
        PREDICTION[['CELL_LINE','COMBINATION_ID','PREDICTION']].to_csv(predfile,
                index=False)

        _record_dream_score(model3, model_json, 'score(dream)',
                obsfile, predfile)
def mixedmodel(feature_codes, label, predfile = 'output_pred.csv', trainids=[], testids=[], coefmode='coef(min)', 
        with_small=False, overwrite=False, train_ratio=0.66, nfolds=10, nrepeats=1, 
        threshold=1000, alpha=1.0, ncores=1, clear=False, testfile=None,
        online=False):
    """Predict scores for ``THERAPY_USER.CSV`` with the STEP3 model.

    Copies the training and test therapy files into the current directory,
    loads the coefficients stored under ``['data']['coef(min)']`` of
    ``STEP3_FULL.json``, builds the 'user' feature matrix restricted to those
    coefficients and writes the output of ``predict`` to ``predfile``.

    Args:
        feature_codes: Keys into ``feature_db`` selecting the features.
        label: ``<label>.json`` is only used for the early-return check.
        predfile: Path of the CSV file receiving the predictions.
        with_small: Forwarded to ``collect_feature``.
        overwrite: See ``online``.
        online: The function returns right after copying the therapy files
            when ``<label>.json`` exists and both ``overwrite`` and
            ``online`` equal ``False``.
        trainids, testids, coefmode, train_ratio, nfolds, nrepeats,
        threshold, alpha, ncores, clear, testfile: Accepted for interface
            compatibility; not used by this implementation.

    Returns:
        None.
    """
    shutil.copy(pydream2015.DATA_COMBITHERAPY, 'THERAPY_TRAINSET.CSV')
    shutil.copy(pydream2015.DATA_COMBITHERAPY_TEST, 'THERAPY_TESTSET.CSV')

    if exists(label + '.json') and (overwrite == False) and online == False:
        return

    EXPERIMENT_DATA = pd.read_csv('THERAPY_TRAINSET.CSV')

    features = [feature_db[code] for code in feature_codes]

    # The random effects are computed from the full training set.
    rndeffval2 = calc_randomeffect2(EXPERIMENT_DATA)

    # Close the model file deterministically instead of leaking the handle.
    with open('STEP3_FULL.json') as handle:
        model3 = json.load(handle)
    coefmin = model3['data']['coef(min)']

    # Interaction terms are written as 'a*b'; split them into single names
    # and drop the duplicates.
    coefnames = [term for coef in coefmin.keys() for term in coef.split('*')]
    linear_features = list(set(coefnames))

    XTEST = collect_feature(features, with_small=with_small, mode='user')
    XTEST = expand_poly2(XTEST, cols=linear_features)

    # Keep only the expanded columns that match a model coefficient.
    matched = set(coefmin.keys()).intersection(
            set(XTEST.columns.values.tolist()))
    XTEST = XTEST[list(matched)]

    TEST_SET = pd.read_csv('THERAPY_USER.CSV')
    PREDICTION = TEST_SET.copy()
    PREDICTION['SYNERGY_SCORE'] = np.nan
    Yprediction = predict(PREDICTION, XTEST, coefmin, rndeffval2)
    PREDICTION['PREDICTION'] = Yprediction
    PREDICTION[['CELL_LINE','COMBINATION_ID','PREDICTION']].to_csv(
            predfile, index=False)