# Validate the SVR generalization score for several cross-validation fold
# counts, repeated a few times per glomerulus, and dump the results as JSON.
reload(run_lib)  # Python-2 builtin reload: pick up edits to run_lib in an interactive session

plt.close('all')
n_folds_list = [5, 10, 20, 50]   # fold counts to compare
n_repetitions = 5                # repetitions per fold count (gen_score varies with the random split)
method = 'svr'
out_path = '/Users/dedan/projects/master/results/validation/gen_score_svr'

base_path = os.path.join(os.path.dirname(__file__), '..')
# FIX: close config file handles deterministically instead of leaking them
with open(os.path.join(base_path, 'config', 'validate_genscore_svr.json')) as f:
    config = json.load(f)
config['data_path'] = os.path.join(base_path, 'data')

# load the features
features = run_lib.prepare_features(config)

with open(os.path.join(config['data_path'], 'used_glomeruli.json')) as f:
    used_glomeruli = json.load(f)
# res[glomerulus][n_folds] -> list of gen_scores (one per repetition)
res = {g: {nf: [] for nf in n_folds_list} for g in used_glomeruli}
for glom in used_glomeruli:

    print(glom)
    config['glomerulus'] = glom
    data, targets, molids = run_lib.load_data_targets(config, features)
    # keep all available features (selection disabled by selecting k = n_features)
    config['feature_selection']['k_best'] = data.shape[1]

    for i, n_folds in enumerate(n_folds_list):
        print(n_folds)
        config['methods'][method]['n_folds'] = n_folds
        for j in range(n_repetitions):
            run_res = run_lib.run_runner(config, data, targets)
            res[glom][n_folds].append(run_res[method]['gen_score'])

# BUG FIX: json.dump(obj, fp) needs the object AND a writable file handle;
# the original json.dump(open(path)) raised a TypeError (and opened read-only).
with open(os.path.join(out_path, 'res.json'), 'w') as f:
    json.dump(res, f)
# --- fragment: compare models trained with different feature configurations ---
# Prepare features once per configuration and cache them, because
# rl.prepare_features is the expensive step.
cache = {}
for k, config in feat_config.items():
    base_config['features'].update(config)
    cache[k] = {"features": rl.prepare_features(base_config)}
# molecules (feature-dict keys) available in EVERY feature set
all_mols = [r['features'].keys() for r in cache.values()]
mol_intersection = set(all_mols[0]).intersection(*all_mols[1:])

# res[feature-config name][glomerulus] -> per-model results
res = {n: {g: {} for g in gloms} for n in feat_config}
for glom in gloms:
    print('{}\n'.format(glom))
    base_config.update({'glomerulus': glom, 'data_path': data_path})

    # load data/targets/molids once per feature configuration for this glomerulus
    dtm = {}
    for name, config in feat_config.items():
        base_config['features'].update(config)
        data, targets, molids = rl.load_data_targets(base_config, cache[name]['features'])
        dtm[name] = {
            'data': data,
            'targets': targets,
            'molids': molids
        }

    # select molecules that none of the models will be trained on
    # NOTE(review): dict.values()[0] is Python-2-only (py3 returns a non-indexable view)
    all_trained = set(dtm.values()[0]['molids']).union(*[m['molids'] for m in dtm.values()[1:]])
    to_predict_molids = mol_intersection - all_trained

    for name, data in dtm.items():

        # fit model
        print('working on model: {}'.format(name))
        # use all features (k_best = n_features disables selection);
        # NOTE(review): the loop body is truncated here at a chunk boundary
        base_config['feature_selection']['k_best'] = data['data'].shape[1]
            "n_folds": 50
        }
    },
    "randomization_test": False
}
# NOTE(review): the dict closed above is the tail of a config literal whose
# opening lines are missing at this chunk boundary (concatenated scripts).

# Compare generalization scores of Haddad features alone vs. vibrational
# (spectral) features alone, per glomerulus.
used_gloms = json.load(open(os.path.join(config['data_path'], 'used_glomeruli.json')))

alone_haddad, alone_vib, together = [], [], []
for glom in used_gloms:

    config['glomerulus'] = glom

    # prepare haddad features
    features_h = run_lib.prepare_features(config)
    data_h, targets_h, molids_h = run_lib.load_data_targets(config, features_h)
    # keep all features (selection disabled by selecting k = n_features)
    config['feature_selection']['k_best'] = data_h.shape[1]
    tmp = run_lib.run_runner(config, data_h, targets_h)
    # Python-2 print statement
    print glom, tmp
    alone_haddad.append(tmp['svr']['gen_score'])

    # prepare vib100: IR spectra binned at 150 cm^-1 with 100 cm^-1 kernel,
    # intensities ignored (presence only)
    config_spec = copy.deepcopy(config)
    config_spec['features']['type'] = 'spectral'
    config_spec['features']['kernel_width'] = 100
    config_spec['features']['bin_width'] = 150
    config_spec['features']['use_intensity'] = False
    config_spec['features']['spec_type'] = 'ir'

    features_v = run_lib.prepare_features(config_spec)
    # NOTE(review): the loop body is truncated here at a chunk boundary
    data_v, targets_v, molids_v = run_lib.load_data_targets(config_spec, features_v)
    },
    "methods": {
        "svr": {
            "cross_val": True,
            "C": 1.0,
            "n_folds": 10
        }
    },
    "data_path": "/Users/dedan/projects/master/data",
    "glomerulus": "Or22a",
    "randomization_test": False
}
# NOTE(review): the dict closed above is the tail of a config literal whose
# opening lines are missing at this chunk boundary (concatenated scripts).

# get the molids
spectra = pickle.load(open('/Users/dedan/projects/master/data/spectral_features/large_base/parsed.pckl'))
_, targets, molids = run_lib.load_data_targets(config, {str(i): [] for i in spectra.keys()})

# some molids map to two CAS numbers for some molecules, use only first
first_molids_idx = sorted([molids.index(m) for m in set(molids)])
targets = targets[first_molids_idx]
molids = [molids[i] for i in first_molids_idx]

# place the intensity values in a high-resolution vector
freqs = {k: v for k, v in spectra.items() if k in molids}
data_orig = flib._place_waves_in_vector(freqs, 0.01, True, 'ir')
assert len(molids) == len(targets) == data_orig.shape[0]

# randomization
np.random.seed()
map(np.random.shuffle, data_orig.T)
import os
import json
from master.libs import run_lib
import numpy as np
import pylab as plt

# experiment configuration: conventional descriptors, linear feature
# selection, SVR with 50-fold cross-validation, no randomization test
config = {
    "data_path": os.path.join(os.path.dirname(__file__), "..", "data"),
    "features": {"type": "conventional", "descriptor": "all", "normalize": True, "properties_to_add": []},
    "feature_selection": {"method": "linear"},
    "methods": {"svr": {"C": 1.0, "n_folds": 50}},
    "randomization_test": False,
}

# FIX: close the file handle deterministically instead of leaking it
with open(os.path.join(config["data_path"], "used_glomeruli.json")) as f:
    used_gloms = json.load(f)

# Run the SVR once per glomerulus on the conventional ("all") descriptors.
for glom in used_gloms:

    config["glomerulus"] = glom
    features = run_lib.prepare_features(config)
    data_all, targets, _ = run_lib.load_data_targets(config, features)
    # switch descriptor and reload; NOTE(review): `features` was prepared with
    # descriptor "all" — presumably load_data_targets re-filters by the new
    # descriptor setting, verify against run_lib.
    config["features"]["descriptor"] = "saito_desc"
    data_saito, _, _ = run_lib.load_data_targets(config, features)
    np.random.seed()
    # map(np.random.shuffle, data_all.T)
    # NOTE(review): new_data (saito + all features, stacked) is built but never
    # used — run_runner below operates on data_all only; looks like an
    # experiment leftover, confirm before deleting.
    new_data = np.hstack((data_saito, data_all))
    config["feature_selection"]["k_best"] = data_all.shape[1]
    tmp = run_lib.run_runner(config, data_all, targets, False, False)
    # Python-2 print statement
    print glom, tmp["svr"]["gen_score"]