Example #1

# NOTE: imports assumed by the functions below. `scoring`, `prog`, and `f_int`
# are project-local helpers that are referenced in this file but not defined here.
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
def subject_regularize(rfcs,
                       X_int,
                       X_other,
                       Y,
                       oob=False,
                       regularize=[0.75, 0.3, 0.65]):
    if len(regularize) == 1:
        regularize = regularize * 3
    observed_ = []
    predicted_ = []
    for subject in range(1, 50):
        observed = Y['subject'][subject]
        rfc = rfcs[1][subject]
        if oob:
            predicted = rfc.oob_prediction_
        else:
            predicted = rfc.predict(X_other)
            predicted_int = rfc.predict(X_int)
            predicted[:, 0] = predicted_int[:, 0]
        observed_.append(observed)
        predicted_.append(predicted)
    predicted = np.dstack(predicted_)
    observed = np.ma.dstack(observed_)
    predicted_mean = np.mean(predicted, axis=2, keepdims=True)
    predicted_std = np.std(predicted, axis=2, keepdims=True)
    predicted_mean_std = np.hstack((predicted_mean, predicted_std)).squeeze()
    predicted_int = regularize[0]*(predicted_mean)\
                  + (1-regularize[0])*predicted
    predicted_ple = regularize[1]*(predicted_mean)\
                  + (1-regularize[1])*predicted
    predicted_dec = regularize[2]*(predicted_mean)\
                  + (1-regularize[2])*predicted
    predicted = regularize[0]*(predicted_mean)\
              + (1-regularize[0])*predicted
    r_int = scoring.r('int', predicted_int, observed)
    r_ple = scoring.r('ple', predicted_ple, observed)
    r_dec = scoring.r('dec', predicted_dec, observed)
    score1_ = scoring.score(predicted, observed, n_subjects=49)
    score1 = scoring.rs2score(r_int, r_ple, r_dec)
    #print(score1_,score1)
    print("For subchallenge %d, score = %.3f (%.3f,%.3f,%.3f)"\
          % (1,score1,r_int,r_ple,r_dec))
    score2 = scoring.score2(predicted_mean_std, Y['mean_std'])
    r_int_mean = scoring.r2('int', 'mean', predicted_mean_std, Y['mean_std'])
    r_ple_mean = scoring.r2('ple', 'mean', predicted_mean_std, Y['mean_std'])
    r_dec_mean = scoring.r2('dec', 'mean', predicted_mean_std, Y['mean_std'])
    r_int_std = scoring.r2('int', 'sigma', predicted_mean_std, Y['mean_std'])
    r_ple_std = scoring.r2('ple', 'sigma', predicted_mean_std, Y['mean_std'])
    r_dec_std = scoring.r2('dec', 'sigma', predicted_mean_std, Y['mean_std'])
    print("For subchallenge %d, score = %.2f (%.2f,%.2f,%.2f,%.2f,%.2f,%.2f)"%\
          (2,score2,r_int_mean,r_ple_mean,r_dec_mean,
          r_int_std,r_ple_std,r_dec_std))
    return (r_int, r_ple, r_dec, r_int_mean, r_ple_mean, r_dec_mean, r_int_std,
            r_ple_std, r_dec_std)
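

# A minimal, self-contained sketch of the regularization step subject_regularize
# performs: each subject's predictions are shrunk toward the across-subject mean
# prediction, shrunk = reg * mean + (1 - reg) * predicted, with a separate weight
# per rating type. The helper name, data, and shapes below are illustrative
# assumptions, not part of the original pipeline.
def _demo_subject_regularization():
    import numpy as np
    rng = np.random.RandomState(0)
    # 69 odors x 21 descriptors x 49 subjects (shapes chosen for illustration only)
    predicted = rng.rand(69, 21, 49)
    predicted_mean = np.mean(predicted, axis=2, keepdims=True)  # shape (69, 21, 1)
    reg = 0.75  # e.g. the intensity weight, regularize[0]
    shrunk = reg * predicted_mean + (1 - reg) * predicted  # broadcasts over subjects
    assert shrunk.shape == predicted.shape
    return shrunk
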
def rfc_(X_train,
         Y_train,
         X_test_int,
         X_test_other,
         Y_test,
         max_features=1500,
         n_estimators=1000,
         max_depth=None,
         min_samples_leaf=1):
    print(max_features)  # echo the max_features setting being used for this fit

    def rfc_maker():
        return RandomForestRegressor(max_features=max_features,
                                     n_estimators=n_estimators,
                                     max_depth=max_depth,
                                     min_samples_leaf=min_samples_leaf,
                                     n_jobs=-1,
                                     oob_score=True,
                                     random_state=0)

    rfc = rfc_maker()
    rfc.fit(X_train, Y_train)
    scores = {}
    for phase, X, Y in [('train', X_train, Y_train),
                        ('test', (X_test_int, X_test_other), Y_test)]:
        if phase == 'train':
            predicted = rfc.oob_prediction_
        else:
            predicted = rfc.predict(X[1])
            predicted_int = rfc.predict(X[0])
            predicted[:, 0] = predicted_int[:, 0]
            predicted[:, 21] = predicted_int[:, 21]
        observed = Y
        score = scoring.score2(predicted, observed)
        r_int = scoring.r2('int', 'mean', predicted, observed)
        r_ple = scoring.r2('ple', 'mean', predicted, observed)
        r_dec = scoring.r2('dec', 'mean', predicted, observed)
        r_int_sig = scoring.r2('int', 'sigma', predicted, observed)
        r_ple_sig = scoring.r2('ple', 'sigma', predicted, observed)
        r_dec_sig = scoring.r2('dec', 'sigma', predicted, observed)
        print("For subchallenge 2, %s phase, score = %.2f (%.2f,%.2f,%.2f,%.2f,%.2f,%.2f)" \
                % (phase,score,r_int,r_ple,r_dec,r_int_sig,r_ple_sig,r_dec_sig))
        scores[phase] = (score, r_int, r_ple, r_dec, r_int_sig, r_ple_sig,
                         r_dec_sig)

    return rfc, scores['train'], scores['test']
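

# Minimal runnable sketch of the out-of-bag mechanism that rfc_ relies on for its
# 'train' phase: with oob_score=True, scikit-learn exposes oob_prediction_, an
# (n_samples, n_outputs) array in which each sample is predicted only by trees
# that did not see it during bootstrapping. The helper name, data, and shapes
# below are synthetic, for illustration only.
def _demo_oob_prediction():
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    rng = np.random.RandomState(0)
    X = rng.rand(200, 50)
    Y = rng.rand(200, 42)  # 21 descriptor means + 21 descriptor stds (layout assumed)
    rf = RandomForestRegressor(n_estimators=50, oob_score=True,
                               n_jobs=-1, random_state=0)
    rf.fit(X, Y)
    oob = rf.oob_prediction_  # each row predicted only by trees that did not train on it
    assert oob.shape == Y.shape
    return oob
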
def rfc_final(X,
              Y_imp,
              Y_mask,
              max_features,
              min_samples_leaf,
              max_depth,
              et,
              use_mask,
              trans_weight,
              trans_params,
              X_test_int=None,
              X_test_other=None,
              Y_test=None,
              n_estimators=100,
              seed=0,
              quiet=False):

    if X_test_int is None:
        X_test_int = X
    if X_test_other is None:
        X_test_other = X
    if Y_test is None:
        Y_test = Y_mask


    def rfc_maker(n_estimators=n_estimators,max_features=max_features,
                  min_samples_leaf=min_samples_leaf,max_depth=max_depth,et=False):
        if not et: 
            kls = RandomForestRegressor
            kwargs = {'oob_score':False}
        else:
            kls = ExtraTreesRegressor
            kwargs = {}

        return kls(n_estimators=n_estimators, max_features=max_features,
                   min_samples_leaf=min_samples_leaf, max_depth=max_depth,
                   n_jobs=-1, random_state=seed, **kwargs)
        
    rfcs = {}
    for col in range(42):
        prog(col,42)
        rfcs[col] = rfc_maker(n_estimators=n_estimators,
                                max_features=max_features[col],
                                min_samples_leaf=min_samples_leaf[col],
                                max_depth=max_depth[col],
                                et=et[col])

        if use_mask[col]:
            rfcs[col].fit(X,Y_mask[:,col])
        else:
            rfcs[col].fit(X,Y_imp[:,col])
    
    predicted = np.zeros((X_test_int.shape[0],42))
    for col in range(42):
        if et[col] or not np.array_equal(X,X_test_int):
            # Predict directly: either this column uses ExtraTrees (no OOB
            # predictions available, so in-sample fit is the only option when
            # the test set equals the training set) or the test set is
            # genuinely held out.
            if col in [0,21]:
                predicted[:,col] = rfcs[col].predict(X_test_int)
            else:
                predicted[:,col] = rfcs[col].predict(X_test_other)
        else:
            try:
                predicted[:,col] = rfcs[col].oob_prediction_
            except AttributeError:
                if col in [0,21]:
                    predicted[:,col] = rfcs[col].predict(X_test_int)
                else:
                    predicted[:,col] = rfcs[col].predict(X_test_other)

    def f_transform(x, k0, k1):
        return 100 * (k0 * (x / 100)**(k1 * 0.5) - k0 * (x / 100)**(k1 * 2))

    for col in range(21):
        tw = trans_weight[col]
        k0,k1 = trans_params[col]
        p_m = predicted[:,col]
        p_s = predicted[:,col+21]
        predicted[:,col+21] = tw*f_transform(p_m,k0,k1) + (1-tw)*p_s
    
    observed = Y_test
    score = scoring.score2(predicted,observed)
    rs = {}
    for kind in ['int','ple','dec']:
        rs[kind] = {}
        for moment in ['mean','sigma']:
            rs[kind][moment] = scoring.r2(kind,moment,predicted,observed)
    
    if not quiet:
        print("For subchallenge 2:")
        print("\tScore = %.2f" % score)
        for kind in ['int','ple','dec']:
            for moment in ['mean','sigma']: 
                print("\t%s_%s = %.3f" % (kind,moment,rs[kind][moment]))
        
    return (rfcs,score,rs)
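

# Self-contained sketch of the std-from-mean transform used above: for each of
# the 21 descriptors, the predicted standard deviation is blended with
# f_transform(predicted_mean), i.e. tw * f_transform(mean) + (1 - tw) * std.
# The helper name and the k0, k1, tw values below are placeholders, not fitted
# trans_params from the original pipeline.
def _demo_std_transform():
    import numpy as np

    def f_transform(x, k0, k1):
        return 100 * (k0 * (x / 100)**(k1 * 0.5) - k0 * (x / 100)**(k1 * 2))

    rng = np.random.RandomState(0)
    p_mean = 100 * rng.rand(69)  # predicted descriptor means on a 0-100 scale
    p_std = 30 * rng.rand(69)    # predicted descriptor standard deviations
    k0, k1, tw = 1.0, 1.0, 0.5   # placeholder transform parameters and weight
    blended = tw * f_transform(p_mean, k0, k1) + (1 - tw) * p_std
    assert blended.shape == p_std.shape
    return blended
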
def rfc_final(X,
              Y_imp,
              Y_mask,
              max_features,
              min_samples_leaf,
              max_depth,
              et,
              use_mask,
              Y_test=None,
              n_estimators=100,
              seed=0):

    if Y_test is None:
        Y_test = Y_mask

    def rfc_maker(n_estimators=n_estimators,
                  max_features=max_features,
                  min_samples_leaf=min_samples_leaf,
                  max_depth=max_depth,
                  et=False):
        if not et:
            return RandomForestRegressor(n_estimators=n_estimators,
                                         max_features=max_features,
                                         min_samples_leaf=min_samples_leaf,
                                         max_depth=max_depth,
                                         oob_score=True,
                                         n_jobs=-1,
                                         random_state=seed)
        else:
            return ExtraTreesRegressor(n_estimators=n_estimators,
                                       max_features=max_features,
                                       min_samples_leaf=min_samples_leaf,
                                       max_depth=max_depth,
                                       n_jobs=-1,
                                       random_state=seed)

    rfcs = {}
    for kind in ['int', 'ple', 'dec']:
        rfcs[kind] = {}
        for moment in ['mean', 'sigma']:
            rfcs[kind][moment] = rfc_maker(
                n_estimators=n_estimators,
                max_features=max_features[kind][moment],
                min_samples_leaf=min_samples_leaf[kind][moment],
                max_depth=max_depth[kind][moment],
                et=et[kind][moment])

    for kind in ['int', 'ple', 'dec']:
        for moment in ['mean', 'sigma']:
            if use_mask[kind][moment]:
                rfcs[kind][moment].fit(X, Y_mask)
            else:
                rfcs[kind][moment].fit(X, Y_imp)

    predictions = {}
    for kind in ['int', 'ple', 'dec']:
        predictions[kind] = {}
        for moment in ['mean', 'sigma']:
            if et[kind][moment]:
                # Check in-sample fit because there isn't any alternative.
                predictions[kind][moment] = rfcs[kind][moment].predict(X)
            else:
                predictions[kind][moment] = rfcs[kind][moment].oob_prediction_
    predicted = predictions['int']['mean'].copy()
    for i, moment in enumerate(['mean', 'sigma']):
        predicted[:, 0 + 21 * i] = predictions['int'][moment][:, 0 + 21 * i]
        predicted[:, 1 + 21 * i] = predictions['ple'][moment][:, 1 + 21 * i]
        predicted[:, 2 + 21 * i:21 + 21 * i] = \
            predictions['dec'][moment][:, 2 + 21 * i:21 + 21 * i]

    observed = Y_test
    score = scoring.score2(predicted, observed)
    rs = {}
    predictions = {}
    for kind in ['int', 'ple', 'dec']:
        rs[kind] = {}
        for moment in ['mean', 'sigma']:
            rs[kind][moment] = scoring.r2(kind, moment, predicted, observed)
    # f_int is assumed to be an intensity transform defined elsewhere in the
    # project; it is referenced here but not shown in this file.
    rs['int']['trans'] = scoring.r2(None, None, f_int(predicted[:, 0]),
                                    observed[:, 0])

    print("For subchallenge 2:")
    print("\tScore = %.2f" % score)
    for kind in ['int', 'ple', 'dec']:
        for moment in ['mean', 'sigma']:
            print("\t%s_%s = %.3f" % (kind, moment, rs[kind][moment]))

    return (rfcs, score, rs)
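

# Hypothetical sketch of the nested hyperparameter dictionaries this version of
# rfc_final expects: one value per (kind, moment) pair for max_features,
# min_samples_leaf, max_depth, the ExtraTrees flag, and the mask flag. The
# helper name and numbers below are placeholders, not tuned values from the
# original pipeline.
def _demo_rfc_final_params():
    kinds = ['int', 'ple', 'dec']
    moments = ['mean', 'sigma']
    max_features = {k: {m: 1500 for m in moments} for k in kinds}
    min_samples_leaf = {k: {m: 1 for m in moments} for k in kinds}
    max_depth = {k: {m: None for m in moments} for k in kinds}
    et = {k: {m: False for m in moments} for k in kinds}
    use_mask = {k: {m: True for m in moments} for k in kinds}
    return max_features, min_samples_leaf, max_depth, et, use_mask

# A call would then look like (X, Y_imp, and Y_mask are not defined in this file):
#   rfcs, score, rs = rfc_final(X, Y_imp, Y_mask, *_demo_rfc_final_params())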