コード例 #1
0
 def test_normalization(self):
     """normalize to zero mean and unit variance

     Removes invalid features from the fixture, normalizes the result and
     checks that every column of the normalized feature matrix has
     variance 1 and mean 0 (to 5 decimal places).
     """
     valid_features = flib.remove_invalid_features(self.features)
     normed_features = flib.normalize_features(valid_features)
     # Assert on the dict returned by normalize_features instead of relying
     # on the fixture having been modified in place. list() keeps the
     # conversion to a 2-D array working when values() returns a view.
     feature_mat = np.array(list(normed_features.values()))
     for i in range(feature_mat.shape[1]):
         column = feature_mat[:, i]
         self.assertAlmostEqual(np.var(column), 1.0, places=5)
         self.assertAlmostEqual(np.mean(column), 0, places=5)
コード例 #2
0
def _load_all_conventional_features(features_path):
    """Merge all descriptor CSV files in `features_path` into one feature dict.

    Files whose name contains 'haddad' or 'saito' are skipped. For every key
    the vectors read from the individual files are concatenated. Keys whose
    concatenated vector is shorter than the longest one are deleted (the code
    treats them as molecules for which not all features are available).

    Raises ValueError if no features could be read at all.
    """
    all_features = defaultdict(lambda: np.array([]))
    # glob returns files in arbitrary order; sort so that the order of the
    # concatenated feature columns is reproducible between runs and machines
    feature_files = sorted(glob.glob(os.path.join(features_path, '*.csv')))
    for feature_file in feature_files:
        # match on the file name only, so that a data path which happens to
        # contain one of these words does not exclude every single file
        file_name = os.path.basename(feature_file)
        if 'haddad' in file_name or 'saito' in file_name:
            continue
        for key, value in flib.read_feature_csv(feature_file).items():
            all_features[key] = np.hstack((all_features[key], value))
    if not all_features:
        raise ValueError('no conventional features found in %r' % (features_path,))
    max_len = max(len(val) for val in all_features.values())
    # iterate over a copy of the keys because entries are deleted in the loop
    for key in list(all_features.keys()):
        if len(all_features[key]) < max_len:
            del all_features[key]
            print('deleted a molecule because not all features available')
    return all_features


def prepare_features(config):
    """load and prepare the features, either conventional or spectral

    `config` is a nested mapping. Keys read here:
      config['data_path']                    -- base directory of the data
      config['features']['type']             -- 'conventional' or 'spectral'
      config['features']['descriptor']       -- (conventional) CSV base name,
                                                or 'all' to merge all CSVs
      config['features']['spec_type'], ['use_intensity'], ['kernel_width'],
      ['bin_width']                          -- (spectral) passed through to
                                                flib.get_spectral_features
      config['features']['properties_to_add'] -- if truthy, passed to
                                                flib.add_molecule_properties
      config['features']['normalize']        -- if truthy, features are
                                                normalized

    Returns the feature dict after invalid features have been removed and
    the optional property/normalization steps have been applied.

    Raises ValueError for an unknown feature type, or when the 'all'
    descriptor is requested but no features could be loaded.
    """
    feature_config = config['features']
    feature_type = feature_config['type']

    if feature_type == 'conventional':
        features_path = os.path.join(config['data_path'], 'conventional_features')
        if feature_config['descriptor'] == 'all':
            features = _load_all_conventional_features(features_path)
        else:
            feature_file = os.path.join(features_path,
                                        feature_config['descriptor'] + '.csv')
            features = flib.read_feature_csv(feature_file)
    elif feature_type == 'spectral':
        feature_file = os.path.join(config['data_path'], 'spectral_features',
                                    'large_base', 'parsed.pckl')
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only use this on trusted data files.
        # Binary mode plus a context manager: the handle is closed again and
        # the pickle bytes are read unmodified on every platform.
        with open(feature_file, 'rb') as spectra_file:
            spectra = pickle.load(spectra_file)
        features = flib.get_spectral_features(
            spectra,
            spec_type=feature_config['spec_type'],
            use_intensity=feature_config['use_intensity'],
            kernel_widths=feature_config['kernel_width'],
            bin_width=feature_config['bin_width'])
    else:
        # previously fell through and failed later on an unbound local
        raise ValueError('unknown feature type: %r' % (feature_type,))

    features = flib.remove_invalid_features(features)
    if feature_config['properties_to_add']:
        # return value is not used; the helper is expected to update
        # `features` in place -- TODO confirm against flib
        flib.add_molecule_properties(features, feature_config['properties_to_add'])
    if feature_config['normalize']:
        features = flib.normalize_features(features)
    return features
コード例 #3
0
 def test_invalid_column_removed(self):
     """features with zero variance should be removed

     The fixture is expected to have 8 feature columns, of which 6 remain
     after flib.remove_invalid_features.
     """
     def n_columns(feature_dict):
         # list() keeps the conversion to a 2-D array working when
         # values() returns a view instead of a list
         return np.array(list(feature_dict.values())).shape[1]

     self.assertEqual(n_columns(self.features), 8)
     cleaned = flib.remove_invalid_features(self.features)
     # check the returned dict rather than relying on the fixture having
     # been modified in place
     self.assertEqual(n_columns(cleaned), 6)