Example #1
import pandas as pd
from sklearn.preprocessing import StandardScaler

import utils  # assumed local helper module providing descriptor_target_split/join


def data_standardization(train, test):
    """
    Scales descriptors to zero mean and unit variance, using statistics
    computed on the training set only.
    Parameters:
        train: pandas.DataFrame
            Training descriptor and target data.
        test: pandas.DataFrame
            Test descriptor and target data.
    Returns:
        scaled_train, scaled_test: pandas.DataFrame
            Scaled descriptor data rejoined with the (unscaled) targets.
    """
    train_descriptors, train_target = utils.descriptor_target_split(train)
    test_descriptors, test_target = utils.descriptor_target_split(test)
    columns = train_descriptors.columns
    # Fit the scaler on the training descriptors only, then apply it to both sets
    scaler = StandardScaler().fit(train_descriptors)

    scaled_train_descriptors = pd.DataFrame(scaler.transform(train_descriptors),
                                            columns=columns)
    scaled_test_descriptors = pd.DataFrame(scaler.transform(test_descriptors),
                                           columns=columns)

    scaled_train = utils.descriptor_target_join(scaled_train_descriptors,
                                                train_target)
    scaled_test = utils.descriptor_target_join(scaled_test_descriptors,
                                               test_target)

    return scaled_train, scaled_test
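
For reference, the train-only scaling that this function wraps can be reproduced with scikit-learn and pandas alone; a minimal sketch, assuming toy DataFrames in place of the project's utils helpers:

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Toy descriptor tables; column names are illustrative only
train = pd.DataFrame({'desc_a': [1.0, 2.0, 3.0], 'desc_b': [10.0, 20.0, 30.0]})
test = pd.DataFrame({'desc_a': [1.5, 2.5], 'desc_b': [15.0, 25.0]})

scaler = StandardScaler().fit(train)          # statistics from the training set only
train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns)
test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns)
print(train_scaled.mean().round(6))           # ~0 per column on the training set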
Example #2
import pandas as pd
from sklearn.feature_selection import SelectKBest

import utils  # assumed local helper module providing descriptor_target_split/join


def univariate_feature_selection(file,
                                 k_value=10,
                                 score_function="f_regression"):
    """
    Univariate feature selection works by selecting the best features based on
    univariate statistical tests, keeping the k features with the highest scores.
        Parameters:
            file: pandas.DataFrame
                Input DataFrame on which to perform univariate feature selection.
            k_value: int, optional, default=10
                Number of top features to select.
            score_function: string, optional, default="f_regression"
                Scoring function that returns scores and p-values. It must be one of
                "f_regression" or "mutual_info_regression". If none is given,
                "f_regression" is used.
        Returns:
            file: pandas.DataFrame
                DataFrame containing the selected descriptors rejoined with the target.
    """
    if score_function == "f_regression":
        from sklearn.feature_selection import f_regression
        selector = SelectKBest(f_regression, k_value)
    elif score_function == "mutual_info_regression":
        from sklearn.feature_selection import mutual_info_regression
        selector = SelectKBest(mutual_info_regression, k_value)
    descriptors, target = utils.descriptor_target_split(file)
    column_list = list(descriptors.columns)
    transformed_arrays = selector.fit_transform(descriptors,
                                                target.values.ravel())
    transformed_columns_list = [
        column_list[i] for i in selector.get_support(indices=True)
    ]
    file = pd.DataFrame(transformed_arrays, columns=transformed_columns_list)
    file = utils.descriptor_target_join(file, target)
    return file
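
For a quick standalone check of the underlying scikit-learn call, the sketch below runs SelectKBest with f_regression on a small synthetic dataset (the toy data and variable names are assumptions, not part of the project):

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 5)),
                 columns=[f'd{i}' for i in range(5)])
y = 3 * X['d0'] - 2 * X['d3'] + rng.normal(scale=0.1, size=100)

selector = SelectKBest(f_regression, k=2)
X_new = selector.fit_transform(X, y)
kept = X.columns[selector.get_support()]      # expected: d0 and d3
print(list(kept), X_new.shape)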
Example #3
def getAllDescriptors(data):
    smiles, target = utils.descriptor_target_split(data)
    cols = (_topology + _constitutional + _bcut + _basak + _cats2d + _charge +
            _connectivity + _estate + _geary + _kappa + _moe + _moran +
            _moreaubroto)
    AllDescriptors = pd.DataFrame(columns=cols)
    print('\nCalculating Molecular Descriptors...')
    for i in range(len(smiles)):
        print('Row %d out of %d' % (i + 1, len(smiles)), end='')
        print('\r', end='')
        AllDescriptors.loc[i] = getAllDescriptorsforMol(
            Chem.MolFromSmiles(smiles['SMILES'][i]))
    final_df = utils.descriptor_target_join(AllDescriptors, target)
    print('\nCalculating Molecular Descriptors Completed.')
    return final_df
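
The per-molecule descriptor function getAllDescriptorsforMol and the _topology/_constitutional/... name lists are defined elsewhere in the project. As a rough, self-contained sketch of the same loop pattern, the snippet below computes two standard RDKit descriptors instead of the project's full descriptor set (the SMILES list and column choices are illustrative only):

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

smiles_list = ['CCO', 'c1ccccc1', 'CC(=O)O']          # illustrative molecules
rows = []
for smi in smiles_list:
    mol = Chem.MolFromSmiles(smi)
    rows.append({'SMILES': smi,
                 'MolWt': Descriptors.MolWt(mol),     # molecular weight
                 'TPSA': Descriptors.TPSA(mol)})      # topological polar surface area
print(pd.DataFrame(rows))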
Example #4
def getDescriptors(data, descriptor_type='topology'):
    smiles, target = utils.descriptor_target_split(data)
    cols = descriptor_list[descriptor_type]
    AllDescriptors = pd.DataFrame(columns=cols)
    print('\nCalculating %s descriptors...' % descriptor_type)
    for i in range(len(smiles)):
        print('Row %d out of %d' % (i + 1, len(smiles)), end='')
        print('\r', end='')
        AllDescriptors.loc[i] = descriptor_fn[descriptor_type](
            Chem.MolFromSmiles(smiles['SMILES'][i]))
    final_df = utils.descriptor_target_join(AllDescriptors, target)
    print('\nCalculating %s descriptors completed.' % descriptor_type)
    return final_df
Example #5
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

import utils  # assumed local helper module providing descriptor_target_split/join


def tree_based_feature_selection(file,
                                 n_estimators_value=10,
                                 max_features_value=None,
                                 threshold_value="mean"):
    """
    Feature selection using a tree-based estimator to compute feature importances,
    which in turn are used to discard irrelevant features.
        Parameters:
            file: pandas.DataFrame
                Input DataFrame on which to perform tree-based feature selection.
            n_estimators_value: int, optional, default=10
                Number of trees in the forest.
            max_features_value: {int, float, string}, optional, default=None
                The number of features to consider when looking for the best split.
                If int, then consider max_features_value features at each split.
                If float, then max_features_value is a fraction and
                int(max_features_value * n_features) features are considered at each split.
                If "auto", then max_features_value=n_features (deprecated in recent
                scikit-learn releases).
                If "sqrt", then max_features_value=sqrt(n_features).
                If "log2", then max_features_value=log2(n_features).
                If None, then max_features_value=n_features.
            threshold_value: {float, string}, optional, default="mean"
                The threshold to use for feature selection. Features whose importance
                is greater than or equal to the threshold are kept; the others are
                discarded. It can be a float or a string such as "mean" or "median",
                optionally scaled (e.g. "1.25*mean"). If none is given, "mean" is used.
        Returns:
            file: pandas.DataFrame
                DataFrame containing the selected descriptors rejoined with the target.
    """
    descriptors, target = utils.descriptor_target_split(file)
    column_list = list(descriptors.columns)
    clf = ExtraTreesRegressor(n_estimators=n_estimators_value,
                              max_features=max_features_value)
    clf = clf.fit(descriptors, target.values.ravel())
    model = SelectFromModel(clf, prefit=True, threshold=threshold_value)
    transformed_arrays = model.transform(descriptors)
    transformed_columns_list = [
        column_list[i] for i in model.get_support(indices=True)
    ]
    file = pd.DataFrame(transformed_arrays, columns=transformed_columns_list)
    file = utils.descriptor_target_join(file, target)
    return file
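
The same SelectFromModel pattern can be exercised on its own; the following minimal sketch uses a synthetic dataset in which only one column is informative (the dataset and names are illustrative assumptions):

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 6)),
                 columns=[f'd{i}' for i in range(6)])
y = 5 * X['d1'] + rng.normal(scale=0.1, size=200)     # only d1 is informative

forest = ExtraTreesRegressor(n_estimators=50, random_state=0).fit(X, y)
model = SelectFromModel(forest, prefit=True, threshold='mean')
kept = X.columns[model.get_support()]                 # importance >= mean importance
print(list(kept), model.transform(X).shape)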
Example #6
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

import utils  # assumed local helper module providing descriptor_target_split/join


def remove_low_variance_features(file, threshold_value=0.01):
    """
    Feature selector that removes all low-variance features.
        Parameters:
            file: pandas.DataFrame
                Input data from which to compute variances.
            threshold_value: float, optional, default=0.01
                Features with a training-set variance lower than this threshold
                will be removed.
        Returns:
            file: pandas.DataFrame
                DataFrame containing the retained descriptors rejoined with the target.
    """
    descriptors, target = utils.descriptor_target_split(file)
    column_list = list(descriptors.columns)
    selector = VarianceThreshold(threshold_value)
    transformed_arrays = selector.fit_transform(descriptors)
    transformed_columns_list = [
        column_list[i] for i in selector.get_support(indices=True)
    ]
    descriptors = pd.DataFrame(transformed_arrays,
                               columns=transformed_columns_list)
    file = utils.descriptor_target_join(descriptors, target)
    return file
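
For reference, the bare VarianceThreshold call behaves as follows on a toy frame (column names are illustrative only):

import pandas as pd
from sklearn.feature_selection import VarianceThreshold

X = pd.DataFrame({'constant': [1.0, 1.0, 1.0, 1.0],   # zero variance, dropped
                  'varied': [0.0, 1.0, 2.0, 3.0]})     # variance above threshold, kept
selector = VarianceThreshold(threshold=0.01)
X_reduced = selector.fit_transform(X)
print(X.columns[selector.get_support()].tolist(), X_reduced.shape)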
Example #7
def getAllDescriptors(data):
    smiles, target = utils.descriptor_target_split(data)
    cols = (_topology + _constitutional + _bcut + _basak + _cats2d + _charge +
            _connectivity + _estate + _geary + _kappa + _moe + _moran +
            _moreaubroto)
    AllDescriptors = pd.DataFrame(columns=cols)
    ignore = [
        'Ta', 'Nb', 'Os', 'Y', 'Ir', 'Re', 'Ba', 'Ac', 'Ti', 'U', 'V', 'Hf',
        'La', 'Nd', 'Eu', 'Dy', 'Ce', 'Sm', 'Pd', 'Zr', 'Ru', 'W', 'Rh', 'Er',
        'Th'
    ]
    print('\nCalculating Molecular Descriptors...')
    for i in range(len(smiles)):
        # Skip molecules whose SMILES contain any of the unsupported element symbols
        break_counter = 0
        for j in ignore:
            if j in smiles['SMILES'][i]:
                break_counter = 1
                break

        if break_counter == 0:
            print('Row %d out of %d' % (i + 1, len(smiles)), end='')
            print('\r', end='')
            AllDescriptors.loc[i] = getAllDescriptorsforMol(
                Chem.MolFromSmiles(smiles['SMILES'][i]))
    final_df = utils.descriptor_target_join(AllDescriptors, target)
    print('\nCalculating Molecular Descriptors Completed.')
    return final_df
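
One note on the ignore list: matching element symbols as raw substrings of a SMILES string can in principle misfire when a two-letter symbol coincides with two adjacent one-letter atoms (e.g. "Os" read as an oxygen followed by an aromatic sulfur). A more robust check, sketched below with a hypothetical contains_ignored_element helper and assuming RDKit can parse the SMILES, inspects the actual atom symbols:

from rdkit import Chem

ignore = {'Ta', 'Nb', 'Os', 'Y', 'Ir', 'Re', 'Ba', 'Ac', 'Ti', 'U', 'V', 'Hf',
          'La', 'Nd', 'Eu', 'Dy', 'Ce', 'Sm', 'Pd', 'Zr', 'Ru', 'W', 'Rh', 'Er',
          'Th'}

def contains_ignored_element(smi):
    # Parse the SMILES and check actual atom symbols instead of raw substrings
    mol = Chem.MolFromSmiles(smi)
    if mol is None:                      # unparsable SMILES: treat as ignored
        return True
    return any(atom.GetSymbol() in ignore for atom in mol.GetAtoms())

print(contains_ignored_element('CCO'))    # False
print(contains_ignored_element('[Pd]'))   # True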