def make_Y_obs(kinds, target_dilution=None, imputer=None):
    """Build the observed perceptual data for one or more data sets.

    Parameters
    ----------
    kinds : str or list of str
        One or more of 'training', 'leaderboard', 'testset'.  A single
        string is treated as a one-element list.
    target_dilution : optional
        Forwarded unchanged to `get_perceptual_matrices` and
        `get_perceptual_vectors`.  The special value 'gold' builds the
        data at 'high' dilution and then overwrites the intensity columns
        with the values obtained at a target dilution of -3.
    imputer : optional
        None or 'median' creates a median `Imputer`; any other value is
        forwarded unchanged to `get_perceptual_vectors`.

    Returns
    -------
    (Y, imputer) : tuple
        `Y['mean_std']` is the row-wise stack of the per-kind 'mean_std'
        matrices and `Y['subject'][s]` (s = 1..49) is the row-wise masked
        stack of the per-kind matrices for subject `s`.  Kinds are always
        stacked in the order training, leaderboard, testset, regardless
        of their order in `kinds`.  `imputer` is the imputer that was used.

    Raises
    ------
    AssertionError
        If an entry of `kinds` is not a known kind.
    """
    if target_dilution == 'gold':
        # For actual testing, use 1/1000 dilution for intensity and
        # high dilution for everything else.
        Y, imputer = make_Y_obs(kinds, target_dilution='high',
                                imputer=imputer)
        intensity, imputer = make_Y_obs(kinds, target_dilution=-3,
                                        imputer=imputer)
        # Column 0 is overwritten with the intensity values; column 21 is
        # presumably its 'std' counterpart in a 2 x 21 column layout --
        # TODO confirm against build_Y_obs.
        for column in (0, 21):
            Y['mean_std'][:, column] = intensity['mean_std'][:, column]
        for subject in range(1, 50):
            Y['subject'][subject][:, 0] = intensity['subject'][subject][:, 0]
        return Y, imputer

    # isinstance (rather than an exact type check) so that str subclasses
    # are also wrapped instead of being iterated character by character.
    if isinstance(kinds, str):
        kinds = [kinds]
    if imputer in [None, 'median']:
        imputer = Imputer(missing_values=np.nan, strategy='median', axis=0)

    known_kinds = ['training', 'leaderboard', 'testset']
    Y = {}
    for kind in kinds:
        assert kind in known_kinds, "No such kind %s" % kind
        if kind == 'leaderboard':
            loading.format_leaderboard_perceptual_data()
        _, perceptual_data = loading.load_perceptual_data(kind)
        print("Getting basic perceptual data...")
        matrices = get_perceptual_matrices(perceptual_data,
                                           target_dilution=target_dilution)
        print("Flattening into vectors...")
        # Same call order as before: mean, then std, then per-subject.
        v_mean, v_std, v_subject = [
            get_perceptual_vectors(matrices,
                                   imputer=imputer,
                                   statistic=statistic,
                                   target_dilution=target_dilution)
            for statistic in ('mean', 'std', None)
        ]
        print("Assembling into matrices...")
        Y[kind] = build_Y_obs(v_mean, v_std, v_subject)

    print("Combining Y matrices...")
    # Stack in canonical order, skipping kinds that were not requested.
    ordered = [kind for kind in known_kinds if kind in kinds]
    Y_ = {'subject': {}}
    Y_['mean_std'] = np.vstack([Y[kind]['mean_std'] for kind in ordered])
    for subject in range(1, 50):
        Y_['subject'][subject] = np.ma.vstack(
            [Y[kind]['subject'][subject] for kind in ordered])
    print("The Y['mean_std'] matrix now has shape (%dx%d) "
          % Y_['mean_std'].shape +
          "molecules by 2 x perceptual descriptors")
    print("The Y['subject'] dict now has %d matrices of shape (%dx%d) "
          % (len(Y_['subject']),
             Y_['subject'][1].shape[0],
             Y_['subject'][1].shape[1]) +
          "molecules by perceptual descriptors, one for each subject")
    return Y_, imputer
def make_Y_obs(kinds, target_dilution=None, imputer=None, quiet=False):
    """Build the observed perceptual data for one or more data sets.

    Parameters
    ----------
    kinds : str or list of str
        One or more entries of `KINDS`.  A single string is treated as a
        one-element list.
    target_dilution : optional
        Forwarded unchanged to `get_perceptual_matrices` and
        `get_perceptual_vectors`.  The special value 'gold' builds the
        data at 'high' dilution and then overwrites the intensity columns
        with the values obtained at a target dilution of -3.
    imputer : optional
        None or 'median' creates a median `Imputer`; any other value is
        forwarded unchanged to `get_perceptual_vectors`.
    quiet : bool
        If true, the shape summary is not printed.

    Returns
    -------
    (Y, imputer) : tuple
        `Y['mean_std']` is the row-wise stack of the per-kind 'mean_std'
        matrices and `Y['subject'][s]` (s = 1..49) is the row-wise masked
        stack of the per-kind matrices for subject `s`.  Kinds are stacked
        in the order they appear in `KINDS`, regardless of their order in
        `kinds`.  `imputer` is the imputer that was used.

    Raises
    ------
    AssertionError
        If an entry of `kinds` is not in `KINDS`.
    """
    if target_dilution == 'gold':
        # For actual testing, use 1/1000 dilution for intensity and
        # high dilution for everything else.
        # The first pass is always silent so the summary appears at most
        # once; the second pass honours the caller's `quiet` flag (it was
        # previously ignored here, so quiet=True still printed).
        Y, imputer = make_Y_obs(kinds, target_dilution='high',
                                imputer=imputer, quiet=True)
        intensity, imputer = make_Y_obs(kinds, target_dilution=-3,
                                        imputer=imputer, quiet=quiet)
        # Column 0 is overwritten with the intensity values; column 21 is
        # presumably its 'std' counterpart in a 2 x 21 column layout --
        # TODO confirm against build_Y_obs.
        for column in (0, 21):
            Y['mean_std'][:, column] = intensity['mean_std'][:, column]
        for subject in range(1, 50):
            Y['subject'][subject][:, 0] = intensity['subject'][subject][:, 0]
        return Y, imputer

    # isinstance (rather than an exact type check) so that str subclasses
    # are also wrapped instead of being iterated character by character.
    if isinstance(kinds, str):
        kinds = [kinds]
    if imputer in [None, 'median']:
        imputer = Imputer(missing_values=np.nan, strategy='median', axis=0)

    Y = {}
    for kind in kinds:
        assert kind in KINDS, "No such kind %s" % kind
        if kind == 'leaderboard':
            loading.format_leaderboard_perceptual_data()
        if kind == 'testset':
            loading.format_testset_perceptual_data()
        _, perceptual_data = loading.load_perceptual_data(kind)
        # Basic perceptual data for this kind.
        matrices = get_perceptual_matrices(perceptual_data,
                                           target_dilution=target_dilution)
        # Flatten into vectors; same call order as before: mean, then
        # std, then per-subject.
        v_mean, v_std, v_subject = [
            get_perceptual_vectors(matrices,
                                   imputer=imputer,
                                   statistic=statistic,
                                   target_dilution=target_dilution)
            for statistic in ('mean', 'std', None)
        ]
        # Assemble into matrices.
        Y[kind] = build_Y_obs(v_mean, v_std, v_subject)

    # Combine the per-kind matrices in KINDS order, skipping kinds that
    # were not requested.
    ordered = [kind for kind in KINDS if kind in kinds]
    Y_ = {'subject': {}}
    Y_['mean_std'] = np.vstack([Y[kind]['mean_std'] for kind in ordered])
    for subject in range(1, 50):
        Y_['subject'][subject] = np.ma.vstack(
            [Y[kind]['subject'][subject] for kind in ordered])
    if not quiet:
        print("The Y['mean_std'] matrix now has shape (%dx%d) "
              % Y_['mean_std'].shape +
              "molecules by 2 x perceptual descriptors")
        print("The Y['subject'] dict now has %d matrices of shape (%dx%d) "
              % (len(Y_['subject']),
                 Y_['subject'][1].shape[0],
                 Y_['subject'][1].shape[1]) +
              "molecules by perceptual descriptors, one for each subject")
    return Y_, imputer
Exemple #3
0
from sklearn.cross_validation import ShuffleSplit, cross_val_score
from sklearn.grid_search import GridSearchCV

# Resolve the directory containing this file, then climb two levels and
# append that directory to sys.path.
# NOTE(review): `os` and `sys` are used here but are not imported in the
# visible part of the file -- confirm they are imported above.
gerkin_path = os.path.dirname(os.path.abspath(__file__))
opc_python_path = os.path.dirname(gerkin_path)
root_path = os.path.dirname(opc_python_path)
sys.path.append(root_path)

# These imports are placed after the sys.path change above; presumably
# they depend on it -- verify before moving them to the top of the file.
import dream
from opc_python.utils import loading
from opc_python.utils import scoring

# Load the perceptual descriptors data.
perceptual_headers, perceptual_obs_data = loading.load_perceptual_data(
    'training')
# The return value is discarded; the call is made for its side effects only.
loading.format_leaderboard_perceptual_data()
# Show the perceptual metadata types and perceptual descriptor names.
print(perceptual_headers)

# Show the metadata and perceptual descriptor values for the first compound.
print(perceptual_obs_data[1])

# Everything from index 6 onward is counted as a descriptor; presumably the
# first six headers are metadata columns -- TODO confirm against the loader.
num_descriptors = len(perceptual_headers[6:])
# Sanity check that the loaded header agrees with the constant in `dream`.
assert num_descriptors == dream.NUM_DESCRIPTORS
num_subjects = dream.NUM_SUBJECTS
print(
    'There are %d different perceptual descriptors and %d different subjects' %
    (num_descriptors, num_subjects))

# Load the molecular descriptors data.
molecular_headers, molecular_data = loading.load_molecular_data()
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
from sklearn.cross_validation import ShuffleSplit,cross_val_score
from sklearn.grid_search import GridSearchCV

# Resolve the directory containing this file, then climb two levels and
# append that directory to sys.path.
# NOTE(review): `os` and `sys` are used here but are not imported in the
# visible part of the file -- confirm they are imported above.
gerkin_path = os.path.dirname(os.path.abspath(__file__))
opc_python_path  = os.path.dirname(gerkin_path)
root_path = os.path.dirname(opc_python_path)
sys.path.append(root_path)

# These imports are placed after the sys.path change above; presumably
# they depend on it -- verify before moving them to the top of the file.
import dream
from opc_python.utils import loading
from opc_python.utils import scoring

# Load the perceptual descriptors data.
perceptual_headers, perceptual_obs_data = loading.load_perceptual_data('training')
# The return value is discarded; the call is made for its side effects only.
loading.format_leaderboard_perceptual_data()
# Show the perceptual metadata types and perceptual descriptor names.
print(perceptual_headers)

# Show the metadata and perceptual descriptor values for the first compound.
print(perceptual_obs_data[1])

# Everything from index 6 onward is counted as a descriptor; presumably the
# first six headers are metadata columns -- TODO confirm against the loader.
num_descriptors = len(perceptual_headers[6:])
# Sanity check that the loaded header agrees with the constant in `dream`.
assert num_descriptors == dream.NUM_DESCRIPTORS
num_subjects = dream.NUM_SUBJECTS
print('There are %d different perceptual descriptors and %d different subjects' % (num_descriptors,num_subjects))

# Load the molecular descriptors data.
molecular_headers, molecular_data = loading.load_molecular_data()
# Preview the first ten header names and the first ten values of the first
# entry of the molecular data.
print("First ten molecular descriptor types are %s" % molecular_headers[:10])
print("First ten descriptor values for the first compound are %s" % molecular_data[0][:10])