Beispiel #1
0
def feature_precisions(A, y, feature_sels, test_size=0.2, ITER_TIMES=50, classifier='xgboost', params=None):
    """Estimate test accuracy for several feature selections over repeated splits.

    For each of ITER_TIMES random train/test splits, trains one classifier per
    entry in `feature_sels` and records its accuracy on the held-out split.

    Arguments:
        A: feature matrix (samples x features)
        y: class labels as integers
        feature_sels: {key: selection} mapping a display name to the feature
            selection passed to `select_features`
        test_size: float, proportion of samples held out for testing
        ITER_TIMES: int, number of random-split rounds
        classifier: 'xgboost' or 'svm'
        params: optional dict of extra parameters; merged into the xgboost
            training params, or passed to SVC for 'svm'

    Returns:
        {key: [float]} mapping each selection name to its per-round accuracies.

    Raises:
        ValueError: if `classifier` is not 'xgboost' or 'svm'.
    """
    precisions = defaultdict(list)
    # NOTE(review): 'silent' is deprecated in recent xgboost ('verbosity') --
    # kept as-is to preserve behavior on the pinned version
    xgb_params = {'silent': 1, 'objective': 'multi:softmax', 'num_class': 10}
    if params is not None and classifier == 'xgboost':
        xgb_params.update(params)
    elif params is None:
        params = {}
    num_round = 50
    for r in range(ITER_TIMES):
        A_train, A_test, y_train, y_test = train_test_split(A, y, test_size=test_size)
        ms = []
        for key, features_selection in feature_sels.items():
            _A_train = select_features(A_train, features_selection)
            _A_test = select_features(A_test, features_selection)

            if classifier == 'xgboost':
                _A_train = xgb.DMatrix(_A_train, label=y_train)
                _A_test = xgb.DMatrix(_A_test, label=y_test)
                clf = xgb.train(xgb_params, _A_train, num_round)
            elif classifier == 'svm':
                clf = OneVsOneClassifier(SVC(**params))
                clf.fit(_A_train, y_train)
            else:
                # previously fell through and raised NameError on `clf`
                raise ValueError('unknown classifier: {}'.format(classifier))
            h = np.array(clf.predict(_A_test)).astype(int)
            # sklearn convention is (y_true, y_pred); accuracy is symmetric
            # so the result is unchanged
            p = accuracy_score(y_test, h)

            precisions[key].append(p)
            ms.append('{:>7s} precision:{:7.2%}'.format(key, p))
        # write first, then flush, so the progress line appears immediately
        sys.stdout.write('Round {:3d}/{:3d}:{}\r'.format(r+1, ITER_TIMES, '|'.join(ms)))
        sys.stdout.flush()

    precision_info(precisions, ITER_TIMES)
    return precisions
Beispiel #2
0
def get_coords(routes: list, output='dict'):
    """Fetch current train coordinates ('lat'/'lon') for the given routes.

    Arguments:
        routes: list of route identifiers used to build the request URL
        output: 'dict' (default) returns the raw mapping; 'df' returns a
            pandas DataFrame built from that mapping (orient='index')

    Returns:
        dict or pd.DataFrame of per-train latitude/longitude.
    """
    url = make_request_url(routes, verbose=True)
    # renamed from `json` to avoid shadowing the stdlib `json` module
    payload = utils.grab(url)
    trains = parse_train_data(payload)
    features = ['lat', 'lon']
    trains = utils.select_features(trains, features)

    if output == 'df':
        trains = pd.DataFrame.from_dict(trains, orient='index')

    return trains
Beispiel #3
0
def run(data_fn,
        prop_missing=0.,
        max_num_feature=-1,
        feature_selection='random',
        k=10,
        data_dir='_data',
        out_dir='_out'):
    """Run RIDDLE classification interpretation pipeline.

    For every k-fold partition, computes DeepLIFT difference sums from the
    saved model and pickles them; then aggregates the sums across partitions
    and writes an interpretation summary.

    Arguments:
        data_fn: string
            data file filename
        prop_missing: float
            proportion of feature observations which should be randomly masked;
            values in [0, 1)
        max_num_feature: int
            maximum number of features to use; non-positive uses all features
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        data_dir: string
            directory where data files are located
        out_dir: string
            outer directory where outputs (e.g., results) should be saved
    """
    # `load_model` was imported here but never used; removed
    from riddle import emr, feature_importance
    from riddle.models import MLP

    start = time.time()  # timer for the whole run (see final print)

    base_out_dir = get_base_out_dir(out_dir, 'riddle', data_fn, prop_missing,
                                    max_num_feature, feature_selection)
    recursive_mkdir(base_out_dir)

    # get common data
    x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    list_sums_D, list_sums_D2, list_sums_contribs = [], [], []

    for k_idx in range(k):
        full_out_dir = '{}/k_idx={}'.format(base_out_dir, k_idx)
        print('\nPartition k = {}'.format(k_idx))
        x_train_unvec, y_train, _, _, x_test_unvec, y_test = emr.get_k_fold_partition(
            x_unvec, y, k_idx=k_idx, k=k, perm_indices=perm_indices)

        if max_num_feature > 0:  # select features and re-encode
            feat_encoding_dict, idx_feat_dict = select_features(
                x_train_unvec,
                y_train,
                idx_feat_dict,
                method=feature_selection,
                num_feature=num_feature,
                max_num_feature=max_num_feature)
            x_test_unvec = subset_reencode_features(x_test_unvec,
                                                    feat_encoding_dict)
            num_feature = max_num_feature

        # interpret the model previously saved for this partition
        # (the per-partition `start = time.time()` that used to live here
        # clobbered the whole-run timer, making the final elapsed-time print
        # report only the last partition -- removed)
        temp_mlp = MLP(num_feature=num_feature, num_class=num_class)
        hdf5_path = full_out_dir + '/model.h5'
        sums_D, sums_D2, sums_contribs, pairs = \
            feature_importance.get_diff_sums(
                hdf5_path,
                x_test_unvec,
                process_x_func=temp_mlp.process_x,
                num_feature=num_feature,
                num_class=num_class)

        with open(full_out_dir + '/sums_D.pkl', 'wb') as f:
            pickle.dump(sums_D, f)
        with open(full_out_dir + '/sums_D2.pkl', 'wb') as f:
            pickle.dump(sums_D2, f)
        with open(full_out_dir + '/sums_contribs.pkl', 'wb') as f:
            pickle.dump(sums_contribs, f)

        list_sums_D.append(sums_D)
        list_sums_D2.append(sums_D2)
        list_sums_contribs.append(sums_contribs)

    def compute_total_sums(list_sums):
        """Element-wise sum of the per-partition lists of arrays."""
        total_sums = list_sums[0]

        for i in range(1, len(list_sums)):
            for j in range(len(total_sums)):
                total_sums[j] = np.add(total_sums[j], list_sums[i][j])

        return total_sums

    total_sums_D = compute_total_sums(list_sums_D)
    total_sums_D2 = compute_total_sums(list_sums_D2)
    total_sums_contribs = compute_total_sums(list_sums_contribs)

    num_sample = len(x_unvec)
    # NOTE(review): `pairs` is taken from the *last* loop iteration only;
    # presumably identical across partitions -- confirm with the
    # feature_importance.get_diff_sums API
    run_interpretation_summary(x_unvec,
                               y,
                               total_sums_D,
                               total_sums_D2,
                               total_sums_contribs,
                               idx_feat_dict=idx_feat_dict,
                               idx_class_dict=idx_class_dict,
                               icd9_descript_dict=icd9_descript_dict,
                               pairs=pairs,
                               num_sample=num_sample,
                               full_out_dir=base_out_dir)

    print('Computed DeepLIFT scores and analysis in {:.4f} seconds'.format(
        time.time() - start))
    print('-' * 72)
    print()
Beispiel #4
0
def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
        num_class, max_num_sample, feature_selection, k_idx, k, num_search,
        perm_indices):
    """Run a parameter search for a single k-fold partition.

    Arguments:
        method: string
            name of classification method; values = {'logit', 'random_forest',
            'linear_svm', 'poly_svm', 'rbf_svm', 'gbdt', 'riddle'}
        x_unvec: [[int]]
            feature indices that have not been vectorized; each inner list
            collects the indices of features that are present (binary on)
            for a sample
        y: [int]
            list of class labels as integer indices
        idx_feat_dict: {int: string}
            dictionary mapping feature indices to features
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use
        num_class: int
            number of classes present
        max_num_sample: int or None
            cap on the number of validation samples used for the search;
            None means no cap
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        num_search: int
            number of searches (parameter configurations) to try
        perm_indices: np.ndarray, int
            array of indices representing a permutation of the samples with
            shape (num_sample, )

    Returns:
        best_param: {string: ?}
            dictionary mapping parameter names to the best values found

    Raises:
        ValueError: if `method` is not one of the supported names
    """
    print('-' * 72)
    print('Partition k = {}'.format(k_idx))

    x_train_unvec, y_train, x_val_unvec, y_val, _, _ = (
        emr.get_k_fold_partition(x_unvec,
                                 y,
                                 k_idx=k_idx,
                                 k=k,
                                 perm_indices=perm_indices))

    if max_num_feature > 0:  # select features and re-encode
        feat_encoding_dict, _ = select_features(
            x_train_unvec,
            y_train,
            idx_feat_dict,
            method=feature_selection,
            num_feature=num_feature,
            max_num_feature=max_num_feature)
        x_val_unvec = subset_reencode_features(x_val_unvec, feat_encoding_dict)
        num_feature = max_num_feature

    # cap number of validation samples (`is not None`, not `!= None`)
    if max_num_sample is not None and len(x_val_unvec) > max_num_sample:
        x_val_unvec = x_val_unvec[:max_num_sample]
        y_val = y_val[:max_num_sample]

    start = time.time()
    if method == 'riddle':
        model_class = MLP
        init_args = {'num_feature': num_feature, 'num_class': num_class}
        param_dist = {
            'num_hidden_layer': 2,  # [1, 2]
            'num_hidden_node': 512,  # [128, 256, 512]
            'activation': ['prelu', 'relu'],
            'dropout': tuning.Uniform(lo=0.2, hi=0.8),
            'learning_rate': tuning.UniformLogSpace(10, lo=-6, hi=-1),
        }
        best_param = tuning.random_search(model_class,
                                          init_args,
                                          param_dist,
                                          x_val_unvec,
                                          y_val,
                                          num_class=num_class,
                                          k=TUNING_K,
                                          num_search=num_search)
    else:  # scikit-learn methods
        x_val = vectorize_features(x_val_unvec, num_feature)

        if method == 'logit':  # logistic regression
            from sklearn.linear_model import LogisticRegression
            estimator = LogisticRegression(multi_class='multinomial',
                                           solver='lbfgs')
            param_dist = {'C': tuning.UniformLogSpace(base=10, lo=-3, hi=3)}
        elif method == 'random_forest':
            from sklearn.ensemble import RandomForestClassifier
            estimator = RandomForestClassifier()
            param_dist = {
                'max_features': ['sqrt', 'log2', None],
                'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0, hi=7),
                'n_estimators': tuning.UniformIntegerLogSpace(base=2,
                                                              lo=4,
                                                              hi=8)
            }
        elif method == 'linear_svm':
            from sklearn.svm import SVC
            # remark: due to a bug in scikit-learn / libsvm, the sparse 'linear'
            # kernel is much slower than the sparse 'poly' kernel, so we use
            # the 'poly' kernel with degree=1 over the 'linear' kernel
            estimator = SVC(kernel='poly',
                            degree=1,
                            coef0=0.,
                            gamma=1.,
                            probability=True,
                            cache_size=1000)
            param_dist = {'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1)}
        elif method == 'poly_svm':
            from sklearn.svm import SVC
            estimator = SVC(kernel='poly', probability=True, cache_size=1000)
            param_dist = {
                'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
                'degree': [2, 3, 4],
                'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
            }
        elif method == 'rbf_svm':
            from sklearn.svm import SVC
            estimator = SVC(kernel='rbf', probability=True, cache_size=1000)
            param_dist = {
                'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
                'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
            }
        elif method == 'gbdt':
            from xgboost import XGBClassifier
            estimator = XGBClassifier(objective='multi:softprob')
            param_dist = {
                'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0, hi=5),
                'n_estimators': tuning.UniformIntegerLogSpace(base=2,
                                                              lo=4,
                                                              hi=8),
                'learning_rate': tuning.UniformLogSpace(base=10, lo=-3, hi=0)
            }
        else:
            raise ValueError('unknown method: {}'.format(method))

        param_search = RandomizedSearchCV(estimator,
                                          param_dist,
                                          refit=False,
                                          n_iter=num_search,
                                          scoring=loss_scorer)
        param_search.fit(x_val, y_val)

        best_param = param_search.best_params_

    print('Best parameters for {} for k_idx={}: {} found in {:.3f} s'.format(
        method, k_idx, best_param,
        time.time() - start))

    return best_param
Beispiel #5
0
def run(ModelClass, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
        num_class, feature_selection, k_idx, k, params, perm_indices,
        init_args, full_out_dir):
    """Run a classification pipeline for a single k-fold partition.

    Fits one model on the partition's training split, scores it on the test
    split, and hands the probabilities to `evaluate`.

    Arguments:
        ModelClass: Python class
            classification model
        x_unvec: [[int]]
            per-sample lists of active (binary on) feature indices, not yet
            vectorized
        y: [int]
            class labels as integer indices
        idx_feat_dict: {int: string}
            mapping from feature index to feature name
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use; non-positive keeps all
        num_class: int
            number of classes present
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        params: [{string: ?}]
            per-partition dictionaries of model hyperparameters
        perm_indices: np.ndarray, int
            permutation of sample indices with shape (num_sample, )
        init_args: {string: ?}
            keyword arguments used to construct ModelClass
        full_out_dir: string
            directory where outputs (e.g., results) should be saved
    """
    print('-' * 72)
    print('Partition k = {}'.format(k_idx))
    print(params[k_idx])

    (x_train_unvec, y_train, _, _,
     x_test_unvec, y_test) = emr.get_k_fold_partition(
        x_unvec, y, k_idx=k_idx, k=k, perm_indices=perm_indices)

    if max_num_feature > 0:  # re-encode data using only the selected features
        encoding_map, _ = select_features(
            x_train_unvec, y_train, idx_feat_dict,
            method=feature_selection, num_feature=num_feature,
            max_num_feature=max_num_feature)
        x_train_unvec = subset_reencode_features(x_train_unvec, encoding_map)
        x_test_unvec = subset_reencode_features(x_test_unvec, encoding_map)
        num_feature = max_num_feature

    x_train = vectorize_features(x_train_unvec, num_feature)
    x_test = vectorize_features(x_test_unvec, num_feature)

    # merge shared init args with this partition's tuned hyperparameters
    model_args = dict(init_args)
    model_args.update(params[k_idx])

    t_begin = time.time()
    model = ModelClass(**model_args)
    model.fit(x_train, y_train)
    y_test_probas = model.predict_proba(x_test)
    runtime = time.time() - t_begin

    evaluate(y_test, y_test_probas, runtime, num_class=num_class,
             out_dir=full_out_dir)
Beispiel #6
0
def run(x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict,
        num_feature, max_num_feature, num_class, feature_selection, k_idx, k,
        params, perm_indices, full_out_dir):
    """Run a RIDDLE classification pipeline for a single k-fold partition.

    Trains an MLP on the partition's training split, evaluates it on the test
    split, saves the fitted model to disk, and clears the Keras session.

    Arguments:
        x_unvec: [[int]]
            per-sample lists of active (binary on) feature indices, not yet
            vectorized
        y: [int]
            class labels as integer indices
        idx_feat_dict: {int: string}
            mapping from feature index to feature name
        idx_class_dict: {int: string}
            mapping from class index to class name
        icd9_descript_dict: {string: string}
            mapping from ICD9 code to description text
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use; non-positive keeps all
        num_class: int
            number of classes
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        params: [{string: ?}]
            per-partition dictionaries of model hyperparameters
        perm_indices: np.ndarray, int
            permutation of sample indices with shape (num_sample, )
        full_out_dir: string
            directory where outputs (e.g., results) should be saved
    """
    from keras import backend as K
    from riddle import emr, feature_importance
    from riddle.models import MLP

    print('Partition k = {}'.format(k_idx))
    print()
    (x_train_unvec, y_train, x_val_unvec, y_val,
     x_test_unvec, y_test) = emr.get_k_fold_partition(
        x_unvec, y, k_idx=k_idx, k=k, perm_indices=perm_indices)

    if max_num_feature > 0:  # re-encode data using only the selected features
        encoding_map, idx_feat_dict = select_features(
            x_train_unvec, y_train, idx_feat_dict,
            method=feature_selection, num_feature=num_feature,
            max_num_feature=max_num_feature)
        x_train_unvec = subset_reencode_features(x_train_unvec, encoding_map)
        x_val_unvec = subset_reencode_features(x_val_unvec, encoding_map)
        x_test_unvec = subset_reencode_features(x_test_unvec, encoding_map)
        num_feature = max_num_feature

    # set up; cap epochs when the output path marks this as a debug run
    max_num_epoch = 3 if 'debug' in full_out_dir else -1
    model = MLP(num_feature=num_feature, num_class=num_class,
                max_num_epoch=max_num_epoch, **params[k_idx])

    # train and test
    t_begin = time.time()

    model.train(x_train_unvec, y_train, x_val_unvec, y_val)
    y_test_probas = model.predict_proba(x_test_unvec)

    runtime = time.time() - t_begin
    print('Completed training and testing in {:.4f} seconds'.format(runtime))
    print('-' * 72)
    print()

    # evaluate model performance
    evaluate(y_test, y_test_probas, runtime, num_class=num_class,
             out_dir=full_out_dir)

    model.save_model(path=full_out_dir + '/model.h5')
    K.clear_session()

    print('Finished with partition k = {}'.format(k_idx))
    print('=' * 72)
    print()
Beispiel #7
0
def main():
    """
    Module to execute the entire package from data retrieval to model
    performance metrics
    @:param: None
    :return: Post process results
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # train_test_split now lives in sklearn.model_selection
    from sklearn.model_selection import train_test_split

    # Importing inhibitor notation data
    # The SMILES and InChI logs of the same material have identical indices
    # Creating and joining the SMILES and InChI dataframes along the same index

    utils.check_files()
    df_compounds_smiles = utils.create_dataframe(
        'data/chemical_notation_'
        'data/compounds_smiles.txt', 'smiles')
    df_compounds_smiles.rename(columns={'ID': 'CID'}, inplace=True)
    df_compounds_smiles.sort_values(by='CID', inplace=True)

    # Importing inhibitor activity data
    activity = pd.read_csv('data/activity_data/AID_743255_datatable.csv')
    activity = utils.clean_activity_dataframe(activity)

    # Merging activity data and compound notation data
    df = activity.merge(df_compounds_smiles)
    df.sort_values(by='CID', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Drop non-descriptor columns before feature space reduction
    df_target = df.drop(['SMILES', 'CID', 'Phenotype'], axis=1)

    # Extracting molecular descriptors for all compounds
    # print("Sending data for descriptor calculation")
    # utils.extract_all_descriptors(df, 'SMILES')

    def _load_feature_set(name):
        """Load one descriptor CSV with its first column as the index.

        pd.DataFrame.from_csv was removed in pandas 1.0;
        pd.read_csv(..., index_col=0) is the documented equivalent.
        """
        return pd.read_csv('data/df_{}.csv'.format(name), index_col=0)

    # Importing feature sets
    df_charge = _load_feature_set('charge')
    df_basak = _load_feature_set('basak')
    df_con = _load_feature_set('con')
    df_estate = _load_feature_set('estate')
    df_constitution = _load_feature_set('constitution')
    df_property = _load_feature_set('property')
    df_kappa = _load_feature_set('kappa')
    df_moe = _load_feature_set('moe')

    print("Joining dataframes")
    df_descriptor = df_kappa.join(df_moe).join(df_constitution).\
        join(df_property).join(df_charge).join(df_estate).join(df_con).join(
        df_basak)
    print("Joining dataframes done")

    print("Checking dataframe for NaN, infinite or too large values")
    df_descriptor = utils.remove_nan_infinite(df_descriptor)

    # Transform all column values to mean 0 and unit variance
    print("Transforming dataframe using mean and variance")
    df_descriptor = utils.transform_dataframe(df_descriptor)
    print("Transforming dataframe using mean and variance done")

    # Feature selection and space reduction
    print("Selecting best features in dataframe")
    df_features = utils.select_features(df_descriptor, df_target)
    print("Selecting best features in dataframe done")

    df = df_features.join(df_target)

    # Data to training task
    # Type check inputs for sanity
    if df is None:
        raise ValueError('df is None')
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df is not a dataframe')
    if TARGET_COLUMN is None:
        raise ValueError('target_column is None')
    # `basestring` is Python 2 only; `str` is the Python 3 equivalent
    if not isinstance(TARGET_COLUMN, str):
        raise TypeError('target_column is not a string')
    if TARGET_COLUMN not in df.columns:
        raise ValueError('target_column (%s) is not a valid column name' %
                         TARGET_COLUMN)

    # Train, validation and test split
    df_train, df_test = train_test_split(df, test_size=0.25)

    # Remove the classification column from the dataframe
    x_train = df_train.drop(TARGET_COLUMN, axis=1)
    x_test = df_test.drop(TARGET_COLUMN, axis=1)
    y_train = pd.DataFrame(df_train[TARGET_COLUMN])
    y_test = pd.DataFrame(df_test[TARGET_COLUMN])

    with open(XY_PICKLE, 'wb') as results:
        pickle.dump(x_train, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(x_test, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_train, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_test, results, pickle.HIGHEST_PROTOCOL)

    models.run_models(x_train, y_train, x_test, y_test)

    post_process.results()