コード例 #1
0
def prepare_features(config):
    """Load and prepare the features, either conventional or spectral.

    Entries of ``config`` that are read:

    * ``config['data_path']`` -- base directory that contains the
      ``conventional_features`` and ``spectral_features`` folders.
    * ``config['features']['type']`` -- ``'conventional'`` or ``'spectral'``.
    * ``config['features']['descriptor']`` (conventional only) -- name of a
      single descriptor CSV (without the ``.csv`` extension), or ``'all'``
      to concatenate every CSV in the folder except the haddad/saito files.
    * ``config['features']['spec_type']``, ``['use_intensity']``,
      ``['kernel_width']`` and ``['bin_width']`` (spectral only) -- forwarded
      to ``flib.get_spectral_features``.
    * ``config['features']['properties_to_add']`` -- if truthy, handed to
      ``flib.add_molecule_properties`` together with the features.
    * ``config['features']['normalize']`` -- if truthy, the features are
      passed through ``flib.normalize_features``.

    Returns:
        The features object produced by the ``flib`` helpers. In the
        ``'all'`` branch this is a dict-like mapping of key -> 1-D numpy
        array holding the horizontally stacked values of all CSV files.

    Raises:
        ValueError: if the feature type is neither ``'conventional'`` nor
            ``'spectral'``, or if the ``'all'`` descriptor yields no
            features at all.
    """
    feature_conf = config['features']
    feature_type = feature_conf['type']

    if feature_type == 'conventional':
        features_path = os.path.join(config['data_path'], 'conventional_features')
        if feature_conf['descriptor'] == 'all':
            all_features = defaultdict(lambda: np.array([]))
            # sorted() makes the column order of the stacked features
            # reproducible; plain glob order depends on the file system.
            for feature_file in sorted(glob.glob(os.path.join(features_path, '*.csv'))):
                # Test only the file name so that a directory which happens
                # to contain one of these words does not exclude everything.
                file_name = os.path.basename(feature_file)
                if 'haddad' in file_name or 'saito' in file_name:
                    continue
                for key, value in flib.read_feature_csv(feature_file).items():
                    all_features[key] = np.hstack((all_features[key], value))
            if not all_features:
                # Without this guard max() below fails with an opaque
                # "arg is an empty sequence" ValueError.
                raise ValueError('no conventional features found in %s' % features_path)
            features = all_features
            # Keep only entries that received values from every file, i.e.
            # whose stacked vector has the maximal length.
            max_len = max(len(val) for val in features.values())
            for key in list(features.keys()):
                if len(features[key]) < max_len:
                    del features[key]
                    print('deleted a molecule because not all features available')
        else:
            feature_file = os.path.join(features_path,
                                        feature_conf['descriptor'] + '.csv')
            features = flib.read_feature_csv(feature_file)
    elif feature_type == 'spectral':
        feature_file = os.path.join(config['data_path'], 'spectral_features',
                                    'large_base', 'parsed.pckl')
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # Binary mode plus a context manager: the handle is closed
        # deterministically and the file is read byte-for-byte.
        with open(feature_file, 'rb') as spectra_file:
            spectra = pickle.load(spectra_file)
        features = flib.get_spectral_features(spectra,
                                              spec_type=feature_conf['spec_type'],
                                              use_intensity=feature_conf['use_intensity'],
                                              kernel_widths=feature_conf['kernel_width'],
                                              bin_width=feature_conf['bin_width'])
    else:
        # Previously an unknown type surfaced later as an UnboundLocalError
        # on `features`; fail early with a meaningful message instead.
        raise ValueError('unknown feature type: %r' % (feature_type,))

    features = flib.remove_invalid_features(features)
    if feature_conf['properties_to_add']:
        # The return value is not used -- presumably the helper updates
        # `features` in place (TODO confirm against flib).
        flib.add_molecule_properties(features, feature_conf['properties_to_add'])
    if feature_conf['normalize']:
        features = flib.normalize_features(features)
    return features
# Visualization script: the top panel shows a histogram of the 'freq' values
# of one molecule, the bottom panel the feature vector computed for the same
# molecule by fl.get_spectral_features. The figure is saved as 'eva.png'.
molid = '1'
outpath = '/Users/dedan/projects/master/results/visualization/'
data_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data')
feature_file = os.path.join(data_path, 'spectral_features', 'large_base', 'parsed.pckl')
# NOTE: unpickling can execute arbitrary code; only load trusted files.
# Open in binary mode inside a context manager so the handle is closed again
# instead of being leaked by a bare open() call.
with open(feature_file, 'rb') as spectra_file:
    spectra = pickle.load(spectra_file)

# top panel: one histogram bin per unit on the 0..4000 x-axis
fig = plt.figure()
ax = fig.add_subplot(211)
ax.hist(spectra[molid]['freq'], bins=4000, edgecolor='0.4', facecolor='0.4')
ax.set_xlim([0, 4000])
ax.set_xticks([0, 4000])
ax.set_xticklabels([0, 4000])
# only label the lower and upper end of the y-axis
ax.set_yticks(ax.get_ylim())
utils.simple_axis(ax)

print('loading..')
# NOTE(review): k_width is not defined in the visible part of this script;
# it has to be provided before this point -- confirm.
features = fl.get_spectral_features(spectra,
                                    use_intensity=False,
                                    kernel_widths=k_width,
                                    bin_width=10)
print('plotting..')
# bottom panel: feature vector of the same molecule
ax = fig.add_subplot(212)
# NOTE(review): the legend label is hard-coded to 20 and does not follow
# k_width -- verify that the two agree.
ax.plot(features[molid], color='0.4', label='sigma: 20')
# keep only the first and last automatic tick and relabel them so that the
# axis reads 0..4000 like the panel above
ax.set_xticks([ax.get_xticks()[0], ax.get_xticks()[-1]])
ax.set_xticklabels([0, 4000])
utils.simple_axis(ax)
ax.set_yticks(ax.get_ylim())
ax.set_xlabel('wavenumber (1/cm)')
ax.legend(loc='upper left', frameon=False, numpoints=1)
fig.savefig(os.path.join(outpath, 'eva.png'), dpi=300)
        # Per-glomerulus analysis: for every kernel width compute a univariate
        # regression test and a random forest fit on the spectral features and
        # collect the results in res[glom].
        # NOTE(review): the enclosing loop/function header is outside this
        # excerpt; `glom` is presumably the loop variable -- confirm.
        print glom
        glom_idx = glomeruli.index(glom)

        # select molecules available for the glomerulus
        targets , tmp_cas_numbers = rdl.get_avail_targets_for_glom(rm, cas_numbers, glom_idx)
        # map each CAS number to a molecule id (first entry of its door2id value)
        molids = [door2id[cas_number][0] for cas_number in tmp_cas_numbers]

        # for some of them the spectra are not available
        # (both lists are filtered with the same condition so that targets
        # and molids stay aligned index by index)
        targets = [targets[i] for i in range(len(tmp_cas_numbers)) if str(molids[i]) in spectra]
        molids = [m for m in molids if str(m) in spectra]

        # result containers, keyed by kernel width further below; 'data' and
        # 'oob_sel' are not filled in the visible part of the code
        res[glom] = {'data': {}, 'regression': {}, 'forest': {}, 'oob': {},
                     'targets': targets, 'oob_prediction': {}, 'oob_sel': {}}
        # NOTE(review): the index `i` is not used in the visible loop body
        for i, kernel_width in enumerate(kernel_widths):

            data = features_lib.get_spectral_features(spectra, molids, resolution, kernel_width=kernel_width)

            # univariate test
            # the first return value is discarded; the second (`p`) is stored
            # as -log10(p), so larger stored values correspond to smaller p
            _, p = f_regression(data, targets)
            res[glom]['regression'][kernel_width] = -np.log10(p)

            # random forest regression
            # NOTE(review): `compute_importances` looks like a keyword of old
            # scikit-learn releases that newer versions no longer accept --
            # confirm against the installed version.
            rfr = RandomForestRegressor(n_estimators=n_estimators,
                                        compute_importances=True,
                                        oob_score=True)

            # TODO: feature selection step
            rfr.fit(data,targets)
            # store feature importances, out-of-bag score and out-of-bag
            # predictions of the fitted forest for this kernel width
            res[glom]['forest'][kernel_width] = rfr.feature_importances_
            res[glom]['oob'][kernel_width] = rfr.oob_score_
            res[glom]['oob_prediction'][kernel_width] = rfr.oob_prediction_