Ejemplo n.º 1
0
def prepDataSet(csv_filename, feature_set=None, dataset_name='generic dataset',
        ddg_cutoff=0.0, truncate=False):
    '''
    Prepare a data set object from a CSV file, under the conventions of this project:
    - the CSV is indexed by PDBID and residue number (columns 0,1)
    - the last column contains label-related data, mostly ddG values of residues.
    - all other columns are feature columns.

    The function reads the columns into a TreeDict structure, such that each component
    (normalized feature data, labels, PDB identifiers, columns used) is accessible as
    an attribute.

    Arguments:

    ``csv_filename`` is the path of the CSV file to load. If the path does not contain
    the substring 'unbound', the data set is flagged as bound (``is_bound``).

    ``feature_set`` is optional and directs the function which features to select
    from the table; its ``all_features`` and ``features`` attributes are used as
    column labels. By default (``None``), every column except the last one is
    selected.

    ``dataset_name`` is optional, giving the TreeDict a name.

    ``ddg_cutoff`` is the threshold applied to the last column: a row is labelled
    True when its value is strictly greater than the cutoff.

    ``truncate``, when true, keeps only the first DEBUG_DATASET_SIZE rows of the
    table (DEBUG_DATASET_SIZE is a constant defined elsewhere in this module).

    Returns the TreeDict, with these attributes set:
    ``csv_filename`` (absolute path), ``is_bound``, ``_df`` (the full table),
    ``feature_set``, ``feature_data_df`` (selected feature columns),
    ``X`` (feature values cast to float and standardized per column with
    ``sklearn.preprocessing.scale``), ``label_data_df`` (last column),
    ``ddg_cutoff``, ``y`` (boolean labels) and ``pdbs`` (first index level).
    '''

    dataset = TreeDict(dataset_name)
    dataset.csv_filename = os.path.abspath(csv_filename)
    # The test is made on the path as given by the caller, not on the absolute path.
    dataset.is_bound = 'unbound' not in csv_filename
    dataset._df = cached_csv_df(
        csv_filename,
        index_col=[0, 1],
        true_values=['True'],
        false_values=['False'],
    )

    if truncate:
        dataset._df = dataset._df[:DEBUG_DATASET_SIZE]

    if feature_set is None:
        # Default: every column but the last (label) column is a feature.
        cols = dataset._df.columns[:-1]
        dataset.feature_set = FeatureSet(cols, cols)
    else:
        dataset.feature_set = feature_set

    # Label-based selection with .loc; the old .ix indexer mixed label and
    # positional lookup and no longer exists in current pandas.
    # The selection is done in two steps so that ``features`` is resolved
    # against the ``all_features`` columns only.
    all_feature_data_df = dataset._df.loc[:, dataset.feature_set.all_features]
    dataset.feature_data_df = all_feature_data_df.loc[:, dataset.feature_set.features]

    # Standardize each feature column (zero mean, unit variance).
    dataset.X = sklearn.preprocessing.scale(
                    dataset.feature_data_df.values.astype(float))

    # The label column is always the last one, so select it by position.
    dataset.label_data_df = dataset._df.iloc[:, -1]
    dataset.ddg_cutoff = ddg_cutoff
    dataset.y = dataset.label_data_df.values > dataset.ddg_cutoff

    # sanity check: one label per feature row
    assert dataset.X.shape[0] == len(dataset.y)

    # PDB identifiers are the first level of the (PDBID, residue number) index.
    dataset.pdbs = dataset.feature_data_df.index.get_level_values(0)

    return dataset