def test_drop_duplicates():
    """Check that ``drop_duplicates`` controls whether repeated formulae are kept.

    The input holds three formulae, one of which ('NaCl') occurs twice. The
    feature frame must have three rows when duplicates are kept and two rows
    when they are dropped.
    """
    formulae = ['NaCl', 'Al2O3', 'NaCl']
    n_formulae = len(formulae)
    df = pd.DataFrame({'formula': formulae, 'target': range(n_formulae)})

    # Duplicates kept: one feature row per input formula.
    feats_all, _, _, _ = cmp.generate_features(df, drop_duplicates=False)
    assert feats_all.shape[0] == n_formulae

    # Duplicates dropped: the repeated 'NaCl' entry accounts for one row less.
    feats_unique, _, _, _ = cmp.generate_features(df, drop_duplicates=True)
    assert feats_unique.shape[0] == n_formulae - 1
def test_sum_feat():
    """Check the 'oliynyk' feature count with ``sum_feat`` off and on.

    The feature frame is expected to have 264 columns with ``sum_feat=False``
    and 308 columns with ``sum_feat=True``.
    """
    formulae = ['NaCl', 'Al2O3', 'SiO2']
    df = pd.DataFrame({'formula': formulae, 'target': range(len(formulae))})

    # (sum_feat setting, expected number of feature columns), checked in order.
    expectations = ((False, 264), (True, 308))
    for use_sum, expected_cols in expectations:
        feats, _, _, _ = cmp.generate_features(
            df, elem_prop='oliynyk', sum_feat=use_sum)
        assert feats.shape[1] == expected_cols
def test_outputs():
    """Check the types and contents of the four values from ``generate_features``.

    Verifies that:

    * the outputs are, in order, a DataFrame, a Series, a Series and a list;
    * the returned targets match the targets that were passed in;
    * the returned formulae match the formulae that were passed in;
    * the final output lists exactly the formulae 'EsNo' and 'Am'.
    """
    formulae = ['NaCl', 'Al2O3', 'NaCl', 'EsNo', 'BaTiO3', 'GaN', 'Am']
    # Draw the targets from a local, seeded generator so a failing run can be
    # reproduced exactly and the global NumPy random state is left untouched.
    targets = np.random.RandomState(0).randn(len(formulae))

    df = pd.DataFrame(columns=['formula', 'target'])
    df['formula'] = formulae
    df['target'] = targets

    out0, out1, out2, out3 = cmp.generate_features(df)

    # check returns are the correct variable type
    # (one assert per output so a failure names the offending value)
    assert isinstance(out0, pd.DataFrame)
    assert isinstance(out1, pd.Series)
    assert isinstance(out2, pd.Series)
    assert isinstance(out3, list)

    # check returned targets are equal to originally specified
    assert np.allclose(out1, targets, rtol=1e-6, atol=1e-10)

    # check returned formulae are equal to originally specified
    assert np.all(out2.values == formulae)

    # check exotic elements are skipped
    assert set(out3) == {'EsNo', 'Am'}
def test_mini():
    """Check that ``mini=True`` yields fewer 'oliynyk' features than ``mini=False``."""
    formulae = ['NaCl', 'Al2O3', 'SiO2']
    df = pd.DataFrame({'formula': formulae, 'target': range(len(formulae))})

    # Full feature set first, then the reduced one, on the same input frame.
    full_feats, _, _, _ = cmp.generate_features(
        df, elem_prop='oliynyk', mini=False)
    mini_feats, _, _, _ = cmp.generate_features(
        df, elem_prop='oliynyk', mini=True)

    # The last axis of each frame is its feature (column) count.
    assert mini_feats.shape[-1] < full_feats.shape[-1]
def test_nans():
    """Check that the formula string 'NaN' still produces a non-empty feature frame.

    NOTE(review): presumably this guards against the literal formula 'NaN'
    being treated as a missing value somewhere in the pipeline -- confirm
    against ``generate_features``.
    """
    formulae = ['NaN']
    df = pd.DataFrame({'formula': formulae, 'target': range(len(formulae))})

    feats, _, _, _ = cmp.generate_features(df)

    # Neither the row count nor the column count may be zero.
    n_rows, n_cols = feats.shape[0], feats.shape[1]
    assert n_rows != 0
    assert n_cols != 0
def test_extend_features():
    """Check that extra input columns are kept when ``extend_features=True``.

    The input frame carries two columns besides 'formula' and 'target'; both
    must appear among the columns of the returned feature frame.
    """
    formulae = ['NaCl', 'Al2O3', 'SiO2']
    df = pd.DataFrame({'formula': formulae, 'target': range(len(formulae))})

    # Extra columns are the target shifted by a fixed offset.
    extra_offsets = (('extra_feature1', 0.5), ('extra_feature2', 1.5))
    for column, offset in extra_offsets:
        df[column] = df['target'] + offset

    feats, _, _, _ = cmp.generate_features(df, extend_features=True)

    assert 'extra_feature1' in feats.columns
    assert 'extra_feature2' in feats.columns
def test_all_elem_props():
    """Smoke-test ``generate_features`` with every entry of ``ELEM_PROPS``.

    There is no explicit assertion: the test passes when each call completes
    and its result unpacks into exactly four values.
    """
    formulae = ['C', 'B', 'F', 'V', 'NaCl', 'Al2O3', 'SiO2']
    df = pd.DataFrame({'formula': formulae, 'target': range(len(formulae))})
    df['extra_feature1'] = df['target'] + 0.5
    df['extra_feature2'] = df['target'] + 1.5

    for prop_name in ELEM_PROPS:
        result = cmp.generate_features(df, elem_prop=prop_name)
        # Unpacking doubles as a check that four outputs are returned.
        _feats, _targets, _formulae, _skipped = result
# -*- coding: utf-8 -*- """ Created on Thu Apr 9 10:07:00 2020 @author: Steven Kauwe """ import pandas as pd from time import sleep from cbfv import composition df = pd.read_csv('test_data_extended_feats.csv') print('Featurizing DataFrame without extended features') sleep(1) output = composition.generate_features(df) X_cbfv = output[0] print('Featurizing DataFrame with extended features') sleep(1) output = composition.generate_features(df, extend_features=True) X_extended = output[0]