Example 1
0
    def _eliminate_cleaner(self, train_features, train_targets, test_features):
        """Remove missing or uninformative feature columns.

        Parameters
        ----------
        train_features : array
            The array of training features.
        train_targets : array
            A list of training target values.
        test_features : array
            The array of test features.
        """
        train_features = np.array(train_features)
        if test_features is not None:
            test_features = np.array(test_features)

        # Columns free of non-finite values.
        finite_cols = clean_infinite(train=train_features,
                                     test=test_features,
                                     targets=train_targets)
        # Columns that actually vary across the training data.
        varying_cols = clean_variance(train=train_features,
                                      test=test_features)

        # Keep only columns that pass both checks.
        self.clean_index = np.intersect1d(finite_cols['index'],
                                          varying_cols['index'])

        clean_train = train_features[:, self.clean_index]
        clean_test = test_features
        if test_features is not None:
            clean_test = test_features[:, self.clean_index]

        return clean_train, train_targets, clean_test
Example 2
0
    def _eliminate_cleaner(self, train_features, train_targets, test_features):
        """Remove missing or uninformative data rows/columns.

        Parameters
        ----------
        train_features : array
            The array of training features.
        train_targets : array
            A list of training target values.
        test_features : array
            The array of test features.
        """
        # clean_infinite wants 2-d targets; remember whether to flatten back.
        flatten_back = len(np.shape(train_targets)) == 1
        if flatten_back:
            train_targets = np.reshape(train_targets,
                                       (len(train_targets), 1))

        finite = clean_infinite(train=train_features,
                                test=test_features,
                                targets=train_targets)

        # Pick up targets after rows with non-finite data were dropped.
        train_targets = finite['targets']
        if flatten_back:
            train_targets = np.reshape(train_targets, (len(train_targets), ))

        # Record indexes removed by the finite-value check.
        self.eliminate_index = finite['index']

        informative = clean_variance(train=finite['train'],
                                     test=finite['test'])

        # Append indexes removed by the variance check.
        self.eliminate_index += informative['index']

        return informative['train'], train_targets, informative['test']
    def clean_variance(self, df_train, df_test, labels):
        """Drop zero-variance features from the train and test frames.

        Thin wrapper around the module-level ``clean_variance`` that adds
        optional progress printing controlled by ``self.verbose``.
        """
        # | - clean_variance
        if self.verbose:
            print("Cleaning variance:")
            print("train_data.shape:", df_train.shape)

        result = clean_variance(
            df_train,
            test=df_test,
            labels=labels,
        )

        if self.verbose:
            print("df_train.shape:", result["train"].shape)
            print("")

        return ({
            "df_train": result["train"],
            "df_test": result["test"],
            "labels": result["labels"],
        })
Example 4
0
    def test_variance(self):
        """Check that zero-variance features are removed."""
        train = np.random.random_sample((50, 5))
        # Make column 1 constant so it carries no information.
        train[:, 1:2] = 109.982
        test = np.random.random_sample((100, 5))

        cleaned = clean.clean_variance(train, test=test)

        # The single constant column should be dropped from both sets.
        self.assertTrue(np.shape(cleaned['train']) == (50, 4))
        self.assertTrue(np.shape(cleaned['test']) == (100, 4))
Example 5
0
def clean_features(features, scale=False):
    """Clean a dict of feature matrices and record removed row indices.

    Rows containing non-finite values are removed per feature set, then
    the remaining data is passed through the finite/variance/skewness
    cleaners (and optionally standardized).

    Parameters
    ----------
    features : dict
        Maps 'train' (and optionally 'test') to 2-d feature arrays.
    scale : bool
        If True, standardize the cleaned features.

    Returns
    -------
    features : dict
        Output of the final cleaning step.
    remove_indices : dict
        Row indices removed from each of 'train' and 'test'.
    """
    remove_indices = {
        'train': np.array([], dtype=int),
        'test': np.array([], dtype=int)
    }
    for key, feature_set in features.items():
        if feature_set is None or len(feature_set) == 0:
            continue
        # Rows containing any non-finite value are dropped entirely.
        # (The original re-check per row was vacuous: np.where on a 2-d
        # boolean always returns a 2-tuple, so its len was always > 1.)
        bad_rows = np.where(~np.isfinite(feature_set).all(axis=1))[0]
        remove_indices[key] = np.append(remove_indices[key], bad_rows)

        features[key] = np.delete(feature_set, remove_indices[key], axis=0)

    if 'test' not in features:
        features['test'] = None

    # Finite features
    features = clean_infinite(features['train'], features['test'])

    # Clean variance
    features = clean_variance(features['train'], features['test'])

    # Clean skewness & standardize
    if features['test'] is None or len(features['test']) == 0:
        features = clean_skewness(features['train'], skewness=3)
        if scale:
            features = standardize(features['train'])

    else:
        features = clean_skewness(features['train'],
                                  features['test'],
                                  skewness=3)
        if scale:
            features = standardize(features['train'], features['test'])

    return features, remove_indices
Example 6
0
# In[4]:


# Generate Voronoi-based fingerprint features for the atoms list.
voro = VoronoiFingerprintGenerator(alist)
data_frame = voro.generate()


# In cases where the generated features do not apply to the input data, `NaN`s are returned.
# There are various ways of filling in this kind of data and the simplest is simply to remove features containing infinite values.
# 
# The conventional data format in CatLearn is a matrix, so we first convert the Pandas dataframe into a numpy array.

# In[5]:


matrix = data_frame.to_numpy()
# Drop columns containing non-finite values.
finite_numeric_data = clean_infinite(matrix)
print(np.shape(finite_numeric_data['train']))


# Furthermore, you might have data sets where certain features have completely the same value. Use `clean_variance` to get rid of those meaningless features.

# In[6]:


# Drop constant (zero-variance) columns.
useful_data = clean_variance(finite_numeric_data['train'])
print(np.shape(useful_data['train']))


# We only selected the first 10 data points in this example, so there are likely to be some invariant features across those 10.
Example 7
0
    r.site = site

syss = reactionlist

# Generate the fingerprints for all systems.
trainfingers = np.asarray([return_features(sys) for sys in syss])
train_name = np.asarray([sys['surface'] + ' ' + sys['a'] for sys in syss])

# Get the target values for training and test.
# NOTE(review): only 'reaction_energy' is iterated, so trainval ends up
# holding that single property; the loop form suggests more were planned.
for prop in ['reaction_energy']:
    trainval = [float(getattr(sys, prop)) for sys in syss]

# Clean up the data: drop non-finite columns, then zero-variance columns.
data_dict0 = clean_infinite(trainfingers)
print(data_dict0['index'])
data_dict1 = clean_variance(data_dict0['train'])
print(data_dict1['index'])
allfingers = data_dict1['train']

# Save all the data.
print('Saving', np.shape(allfingers), 'all data matrix.')
np.save(file=data_path + 'catapp_features.npy',
        arr=allfingers,
        allow_pickle=True,
        fix_imports=True)
np.save(file=data_path + 'catapp_targets.npy',
        arr=trainval,
        allow_pickle=True,
        fix_imports=True)

np.save(file=data_path + 'catapp_name.npy',